1 /* searchutils.c - helper subroutines for grep's matchers.
2 Copyright 1992, 1998, 2000, 2007, 2009-2012 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
23 #define NCHAR (UCHAR_MAX + 1)
26 kwsinit (kwset_t *kwset)
28 static char trans[NCHAR];
31 if (match_icase && MB_CUR_MAX == 1)
33 for (i = 0; i < NCHAR; ++i)
34 trans[i] = tolower (i);
36 *kwset = kwsalloc (trans);
39 *kwset = kwsalloc (NULL);
46 /* Convert the *N-byte string, BEG, to lower-case, and write the
47 NUL-terminated result into malloc'd storage. Upon success, set *N
48 to the length (in bytes) of the resulting string (not including the
49 trailing NUL byte), and return a pointer to the lower-case string.
50 Upon memory allocation failure, this function exits.
51 Note that on input, *N must be larger than zero.
53 Note that while this function returns a pointer to malloc'd storage,
54 the caller must not free it, since this function retains a pointer
55 to the buffer and reuses it on any subsequent call. As a consequence,
56 this function is not thread-safe.
58 When each character in the lower-case result string has the same length
59 as the corresponding character in the input string, set *LEN_MAP_P
60 to NULL. Otherwise, set it to a malloc'd buffer (like the returned
61 buffer, this must not be freed by caller) of the same length as the
62 result string. (*LEN_MAP_P)[J] is the byte-length of the character in
63 BEG that formed byte J of the result, minus the byte-length of its
64 lower-case form. It is usually zero. For the upper-case Turkish I-with-dot
65 it is 1, since the upper-case character occupies two bytes, while the
66 lower-case one occupies only one byte. For the Turkish-I-without-dot
67 in the tr_TR.utf8 locale, it is -1 because the lower-case representation
68 is one byte longer than the original. When that happens, we have two
69 or more slots in *LEN_MAP_P for each such character. We store the
70 difference in the first one and 0's in any remaining slots.
72 This map is used by the caller to convert offset,length pairs that
73 reference the lower-case result to numbers that refer to the matched
74 part of the original buffer. */
77 mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p)
80 static mb_len_map_t *len_map;
81 static size_t outalloc;
82 size_t outlen, mb_cur_max;
87 bool lengths_differ = false;
89 if (*n > outalloc || outalloc == 0)
91 outalloc = MAX(1, *n);
92 out = xrealloc (out, outalloc);
93 len_map = xrealloc (len_map, outalloc);
96 /* appease clang-2.6 */
102 memset (&is, 0, sizeof (is));
103 memset (&os, 0, sizeof (os));
106 mb_cur_max = MB_CUR_MAX;
113 size_t mbclen = mbrtowc (&wc, beg, end - beg, &is);
114 if (outlen + mb_cur_max >= outalloc)
116 size_t dm = m - len_map;
117 out = x2nrealloc (out, &outalloc, 1);
118 len_map = xrealloc (len_map, outalloc);
123 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
125 /* An invalid sequence, or a truncated multi-octet character.
126 We treat it as a single-octet character. */
130 memset (&is, 0, sizeof (is));
131 memset (&os, 0, sizeof (os));
136 size_t ombclen = wcrtomb (p, towlower ((wint_t) wc), &os);
137 *m = mbclen - ombclen;
138 memset (m + 1, 0, ombclen - 1);
142 lengths_differ |= (mbclen != ombclen);
146 *len_map_p = lengths_differ ? len_map : NULL;
154 is_mb_middle (const char **good, const char *buf, const char *end,
157 const char *p = *good;
158 const char *prev = p;
161 /* TODO: can be optimized for UTF-8. */
162 memset(&cur_state, 0, sizeof(mbstate_t));
165 size_t mbclen = mbrlen(p, end - p, &cur_state);
167 /* Store the beginning of the previous complete multibyte character. */
168 if (mbclen != (size_t) -2)
171 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
173 /* An invalid sequence, or a truncated multibyte character.
174 We treat it as a single byte character. */
176 memset(&cur_state, 0, sizeof cur_state);
187 return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state);
189 #endif /* MBS_SUPPORT */