collate: Fix expansion substitions (broken upstream too)
authorJohn Marino <draco@marino.st>
Fri, 23 Oct 2015 12:57:32 +0000 (14:57 +0200)
committerJohn Marino <draco@marino.st>
Fri, 23 Oct 2015 12:57:41 +0000 (14:57 +0200)
Through testing, the user noted that some Cyrillic characters were not
sorting correctly, and this was confirmed.

After extensive testing and review, the localedef tool was eliminated
as the culprit.  The sustitutions were encoded correctly in LC_COLLATE.

The error was mainly in wcscoll where character expansions were
mishandled.  The main directive pass routines had to be written to
go back for a new collation value when the "state" variable was set.
Before pointers were being advanced, the second lookup was gettting
applied to the wrong character, etc.

The "eat expansion codes" section on collate.c also had a bug.  Later
own, the "state" variable logic was changed to only set if next
code was greater than zero (rather than >= 0).

Some additional cleanups got captured from previous work:
1) The previous commit moved the binary search comment from the
   correct location to a wrong location because it's wrong upstream
   in Illumos.  The comment has little value so I just removed it.
2) Don't check if pointers are null before freeing, this is
   redundant as free() handles null pointers.
3) The two binary search trees were standardized wrt initialization
4) On the binary search trees, a negative "high" exits rather than
   checking the table count again.

Reported-by: bapt@FreeBSD.org
diff --git a/lib/libc/locale/collate.c b/lib/libc/locale/collate.c
index 0bd2e1c..6912732 100644
--- a/lib/libc/locale/collate.c
+++ b/lib/libc/locale/collate.c
@@ -216,27 +216,18 @@ substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
  return (p->pri);
 }

-/*
- * Note: for performance reasons, we have expanded bsearch here.  This avoids
- * function call overhead with each comparison.
- */
-
 static collate_chain_t *
 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
 {
- int low;
- int high;
+ int low = 0;
+ int high = table->info->chain_count - 1;;
  int next, compar, l;
  collate_chain_t *p;
- collate_chain_t *tab;
+ collate_chain_t *tab = table->chain_pri_table;

- if (table->info->chain_count == 0)
+ if (high < 0)
  return (NULL);

- low = 0;
- high = table->info->chain_count - 1;
- tab = table->chain_pri_table;
-
  while (low <= high) {
  next = (low + high) / 2;
  p = tab + next;
@@ -266,7 +257,7 @@ largesearch(struct xlocale_collate *table, const wchar_t key)
  collate_large_t *p;
  collate_large_t *tab = table->large_pri_table;

- if (table->info->large_count == 0)
+ if (high < 0)
  return (NULL);

  while (low <= high) {
@@ -310,7 +301,10 @@ _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
  if ((sptr = *state) != NULL) {
  *pri = *sptr;
  sptr++;
- *state = *sptr ? sptr : NULL;
+ if ((sptr == *state) || (sptr == NULL))
+ *state = NULL;
+ else
+ *state = sptr;
  *len = 0;
  return;
  }
@@ -371,7 +365,7 @@ _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
   * code ensures this for us.
   */
  if ((sptr = substsearch(table, *pri, which)) != NULL) {
- if ((*pri = *sptr) != 0) {
+ if ((*pri = *sptr) > 0) {
  sptr++;
  *state = *sptr ? sptr : NULL;
  }
diff --git a/lib/libc/string/wcscoll.c b/lib/libc/string/wcscoll.c
index 87a91c2..c6cb890 100644
--- a/lib/libc/string/wcscoll.c
+++ b/lib/libc/string/wcscoll.c
@@ -74,6 +74,7 @@ wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t locale)
  const int32_t *st2 = NULL;
  const wchar_t *w1 = ws1;
  const wchar_t *w2 = ws2;
+ int check1, check2;

  /* special pass for UNDEFINED */
  if (pass == table->info->directive_count) {
@@ -107,25 +108,36 @@ wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t locale)
  }

  if (direc & DIRECTIVE_POSITION) {
- while ((*w1 || st1) && (*w2 || st2)) {
+ while (*w1 && *w2) {
  pri1 = pri2 = 0;
- _collate_lookup(table, w1, &len1, &pri1, pass,
-     &st1);
- if (pri1 <= 0) {
- if (pri1 < 0) {
- errno = EINVAL;
- goto fail;
+ check1 = check2 = 1;
+ while ((pri1 == pri2) && (check1 || check2)) {
+ if (check1) {
+ _collate_lookup(table, w1, &len1,
+     &pri1, pass, &st1);
+ if (pri1 < 0) {
+ errno = EINVAL;
+ goto fail;
+ }
+ if (!pri1) {
+ pri1 = COLLATE_MAX_PRIORITY;
+ st1 = NULL;
+ }
+ check1 = (st1 != NULL);
  }
- pri1 = COLLATE_MAX_PRIORITY;
- }
- _collate_lookup(table, w2, &len2, &pri2, pass,
-     &st2);
- if (pri2 <= 0) {
- if (pri2 < 0) {
- errno = EINVAL;
- goto fail;
+ if (check2) {
+ _collate_lookup(table, w2, &len2,
+     &pri2, pass, &st2);
+ if (pri2 < 0) {
+ errno = EINVAL;
+ goto fail;
+ }
+ if (!pri2) {
+ pri2 = COLLATE_MAX_PRIORITY;
+ st2 = NULL;
+ }
+ check2 = (st2 != NULL);
  }
- pri2 = COLLATE_MAX_PRIORITY;
  }
  if (pri1 != pri2) {
  ret = pri1 - pri2;
@@ -135,29 +147,38 @@ wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t locale)
  w2 += len2;
  }
  } else {
- while ((*w1 || st1) && (*w2 || st2)) {
+ while (*w1 && *w2) {
  pri1 = pri2 = 0;
- while (*w1) {
- _collate_lookup(table, w1, &len1,
-     &pri1, pass, &st1);
- if (pri1 > 0)
- break;
- if (pri1 < 0) {
- errno = EINVAL;
- goto fail;
+ check1 = check2 = 1;
+ while ((pri1 == pri2) && (check1 || check2)) {
+ while (check1 && *w1) {
+ _collate_lookup(table, w1,
+     &len1, &pri1, pass, &st1);
+ if (pri1 > 0)
+ break;
+ if (pri1 < 0) {
+ errno = EINVAL;
+ goto fail;
+ }
+ st1 = NULL;
+ w1 += 1;
  }
- w1 += len1;
- }
- while (*w2) {
- _collate_lookup(table, w2, &len2,
-     &pri2, pass, &st2);
- if (pri2 > 0)
- break;
- if (pri2 < 0) {
- errno = EINVAL;
- goto fail;
+ check1 = (st1 != NULL);
+ while (check2 && *w2) {
+ _collate_lookup(table, w2,
+     &len2, &pri2, pass, &st2);
+ if (pri2 > 0)
+ break;
+ if (pri2 < 0) {
+ errno = EINVAL;
+ goto fail;
+ }
+ st2 = NULL;
+ w2 += 1;
  }
- w2 += len2;
+ check2 = (st2 != NULL);
+ if (!pri1 || !pri2)
+ break;
  }
  if (!pri1 || !pri2)
  break;
@@ -182,10 +203,8 @@ wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t locale)
  ret = 0;

 end:
- if (tr1)
- free(tr1);
- if (tr2)
- free(tr2);
+ free(tr1);
+ free(tr2);

  return (ret);

lib/libc/locale/collate.c
lib/libc/string/wcscoll.c

index 0bd2e1c..6912732 100644 (file)
@@ -216,27 +216,18 @@ substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
        return (p->pri);
 }
 
-/*
- * Note: for performance reasons, we have expanded bsearch here.  This avoids
- * function call overhead with each comparison.
- */
-
 static collate_chain_t *
 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
 {
-       int low;
-       int high;
+       int low = 0;
+       int high = table->info->chain_count - 1;;
        int next, compar, l;
        collate_chain_t *p;
-       collate_chain_t *tab;
+       collate_chain_t *tab = table->chain_pri_table;
 
-       if (table->info->chain_count == 0)
+       if (high < 0)
                return (NULL);
 
-       low = 0;
-       high = table->info->chain_count - 1;
-       tab = table->chain_pri_table;
-
        while (low <= high) {
                next = (low + high) / 2;
                p = tab + next;
@@ -266,7 +257,7 @@ largesearch(struct xlocale_collate *table, const wchar_t key)
        collate_large_t *p;
        collate_large_t *tab = table->large_pri_table;
 
-       if (table->info->large_count == 0)
+       if (high < 0)
                return (NULL);
 
        while (low <= high) {
@@ -310,7 +301,10 @@ _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
        if ((sptr = *state) != NULL) {
                *pri = *sptr;
                sptr++;
-               *state = *sptr ? sptr : NULL;
+               if ((sptr == *state) || (sptr == NULL))
+                       *state = NULL;
+               else
+                       *state = sptr;
                *len = 0;
                return;
        }
@@ -371,7 +365,7 @@ _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
         * code ensures this for us.
         */
        if ((sptr = substsearch(table, *pri, which)) != NULL) {
-               if ((*pri = *sptr) != 0) {
+               if ((*pri = *sptr) > 0) {
                        sptr++;
                        *state = *sptr ? sptr : NULL;
                }
index 87a91c2..c6cb890 100644 (file)
@@ -74,6 +74,7 @@ wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t locale)
                const int32_t *st2 = NULL;
                const wchar_t   *w1 = ws1;
                const wchar_t   *w2 = ws2;
+               int check1, check2;
 
                /* special pass for UNDEFINED */
                if (pass == table->info->directive_count) {
@@ -107,25 +108,36 @@ wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t locale)
                }
 
                if (direc & DIRECTIVE_POSITION) {
-                       while ((*w1 || st1) && (*w2 || st2)) {
+                       while (*w1 && *w2) {
                                pri1 = pri2 = 0;
-                               _collate_lookup(table, w1, &len1, &pri1, pass,
-                                   &st1);
-                               if (pri1 <= 0) {
-                                       if (pri1 < 0) {
-                                               errno = EINVAL;
-                                               goto fail;
+                               check1 = check2 = 1;
+                               while ((pri1 == pri2) && (check1 || check2)) {
+                                       if (check1) {
+                                               _collate_lookup(table, w1, &len1,
+                                                   &pri1, pass, &st1);
+                                               if (pri1 < 0) {
+                                                       errno = EINVAL;
+                                                       goto fail;
+                                               }
+                                               if (!pri1) {
+                                                       pri1 = COLLATE_MAX_PRIORITY;
+                                                       st1 = NULL;
+                                               }
+                                               check1 = (st1 != NULL);
                                        }
-                                       pri1 = COLLATE_MAX_PRIORITY;
-                               }
-                               _collate_lookup(table, w2, &len2, &pri2, pass,
-                                   &st2);
-                               if (pri2 <= 0) {
-                                       if (pri2 < 0) {
-                                               errno = EINVAL;
-                                               goto fail;
+                                       if (check2) {
+                                               _collate_lookup(table, w2, &len2,
+                                                   &pri2, pass, &st2);
+                                               if (pri2 < 0) {
+                                                       errno = EINVAL;
+                                                       goto fail;
+                                               }
+                                               if (!pri2) {
+                                                       pri2 = COLLATE_MAX_PRIORITY;
+                                                       st2 = NULL;
+                                               }
+                                               check2 = (st2 != NULL);
                                        }
-                                       pri2 = COLLATE_MAX_PRIORITY;
                                }
                                if (pri1 != pri2) {
                                        ret = pri1 - pri2;
@@ -135,29 +147,38 @@ wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t locale)
                                w2 += len2;
                        }
                } else {
-                       while ((*w1 || st1) && (*w2 || st2)) {
+                       while (*w1 && *w2) {
                                pri1 = pri2 = 0;
-                               while (*w1) {
-                                       _collate_lookup(table, w1, &len1,
-                                           &pri1, pass, &st1);
-                                       if (pri1 > 0)
-                                               break;
-                                       if (pri1 < 0) {
-                                               errno = EINVAL;
-                                               goto fail;
+                               check1 = check2 = 1;
+                               while ((pri1 == pri2) && (check1 || check2)) {
+                                       while (check1 && *w1) {
+                                               _collate_lookup(table, w1,
+                                                   &len1, &pri1, pass, &st1);
+                                               if (pri1 > 0)
+                                                       break;
+                                               if (pri1 < 0) {
+                                                       errno = EINVAL;
+                                                       goto fail;
+                                               }
+                                               st1 = NULL;
+                                               w1 += 1;
                                        }
-                                       w1 += len1;
-                               }
-                               while (*w2) {
-                                       _collate_lookup(table, w2, &len2,
-                                           &pri2, pass, &st2);
-                                       if (pri2 > 0)
-                                               break;
-                                       if (pri2 < 0) {
-                                               errno = EINVAL;
-                                               goto fail;
+                                       check1 = (st1 != NULL);
+                                       while (check2 && *w2) {
+                                               _collate_lookup(table, w2,
+                                                   &len2, &pri2, pass, &st2);
+                                               if (pri2 > 0)
+                                                       break;
+                                               if (pri2 < 0) {
+                                                       errno = EINVAL;
+                                                       goto fail;
+                                               }
+                                               st2 = NULL;
+                                               w2 += 1;
                                        }
-                                       w2 += len2;
+                                       check2 = (st2 != NULL);
+                                       if (!pri1 || !pri2)
+                                               break;
                                }
                                if (!pri1 || !pri2)
                                        break;
@@ -182,10 +203,8 @@ wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t locale)
        ret = 0;
 
 end:
-       if (tr1)
-               free(tr1);
-       if (tr2)
-               free(tr2);
+       free(tr1);
+       free(tr2);
 
        return (ret);