| Commit | Line | Data |
|---|---|---|
| 92d0a6a6 | 1 | // -*- C++ -*- |
| 4d3e9548 JL |
2 | /* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2009 |
| 3 | Free Software Foundation, Inc. | |
| 92d0a6a6 JR |
4 | Written by James Clark (jjc@jclark.com) |
| 5 | ||
| 6 | This file is part of groff. | |
| 7 | ||
| 8 | groff is free software; you can redistribute it and/or modify it under | |
| 9 | the terms of the GNU General Public License as published by the Free | |
| 4d3e9548 JL |
10 | Software Foundation, either version 3 of the License, or |
| 11 | (at your option) any later version. | |
| 92d0a6a6 JR |
12 | |
| 13 | groff is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
| 15 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
| 16 | for more details. | |
| 17 | ||
| 4d3e9548 JL |
18 | You should have received a copy of the GNU General Public License |
| 19 | along with this program. If not, see <http://www.gnu.org/licenses/>. */ | |
| 92d0a6a6 JR |
20 | |
| 21 | #include "refer.h" | |
| 22 | #include "token.h" | |
| 23 | ||
// Number of slots in the token hash table.
#define TOKEN_TABLE_SIZE 1009
// Sort key used for thorn: in Icelandic (I believe) thorn sorts after z,
// and "{" is the character that immediately follows 'z' in ASCII.
#define THORN_SORT_KEY "{"
| 27 | ||
| 28 | struct token_table_entry { | |
| 29 | const char *tok; | |
| 30 | token_info ti; | |
| 31 | token_table_entry(); | |
| 32 | }; | |
| 33 | ||
| 34 | token_table_entry token_table[TOKEN_TABLE_SIZE]; | |
| 35 | int ntokens = 0; | |
| 36 | ||
// Advance *ptr past an escape-sequence name, never moving beyond end.
//
// One character is always consumed (if any is available).  If that
// character is '(' up to two further characters are consumed; if it is
// '[' everything up to and including the next ']' is consumed, or the
// rest of the input when no ']' is found.
static void skip_name(const char **ptr, const char *end)
{
  if (*ptr >= end)
    return;
  const char lead = **ptr;
  ++*ptr;
  if (lead == '(') {
    // Two-character name: take at most two more characters.
    for (int remaining = 2; remaining > 0 && *ptr < end; --remaining)
      ++*ptr;
  }
  else if (lead == '[') {
    // Bracketed name: consume through the closing bracket.
    while (*ptr < end) {
      const char ch = **ptr;
      ++*ptr;
      if (ch == ']')
        break;
    }
  }
}
| 56 | ||
| 57 | int get_token(const char **ptr, const char *end) | |
| 58 | { | |
| 59 | if (*ptr >= end) | |
| 60 | return 0; | |
| 61 | char c = *(*ptr)++; | |
| 62 | if (c == '\\' && *ptr < end) { | |
| 63 | switch (**ptr) { | |
| 64 | default: | |
| 65 | *ptr += 1; | |
| 66 | break; | |
| 67 | case '(': | |
| 68 | case '[': | |
| 69 | skip_name(ptr, end); | |
| 70 | break; | |
| 71 | case '*': | |
| 72 | case 'f': | |
| 73 | *ptr += 1; | |
| 74 | skip_name(ptr, end); | |
| 75 | break; | |
| 76 | } | |
| 77 | } | |
| 78 | return 1; | |
| 79 | } | |
| 80 | ||
| 81 | token_info::token_info() | |
| 82 | : type(TOKEN_OTHER), sort_key(0), other_case(0) | |
| 83 | { | |
| 84 | } | |
| 85 | ||
| 86 | void token_info::set(token_type t, const char *sk, const char *oc) | |
| 87 | { | |
| 88 | assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER); | |
| 89 | type = t; | |
| 90 | sort_key = sk; | |
| 91 | other_case = oc; | |
| 92 | } | |
| 93 | ||
| 94 | void token_info::sortify(const char *start, const char *end, string &result) | |
| 95 | const | |
| 96 | { | |
| 97 | if (sort_key) | |
| 98 | result += sort_key; | |
| 99 | else if (type == TOKEN_UPPER || type == TOKEN_LOWER) { | |
| 100 | for (; start < end; start++) | |
| 101 | if (csalpha(*start)) | |
| 102 | result += cmlower(*start); | |
| 103 | } | |
| 104 | } | |
| 105 | ||
| 106 | int token_info::sortify_non_empty(const char *start, const char *end) const | |
| 107 | { | |
| 108 | if (sort_key) | |
| 109 | return *sort_key != '\0'; | |
| 110 | if (type != TOKEN_UPPER && type != TOKEN_LOWER) | |
| 111 | return 0; | |
| 112 | for (; start < end; start++) | |
| 113 | if (csalpha(*start)) | |
| 114 | return 1; | |
| 115 | return 0; | |
| 116 | } | |
| 117 | ||
| 118 | ||
| 119 | void token_info::lower_case(const char *start, const char *end, | |
| 120 | string &result) const | |
| 121 | { | |
| 122 | if (type != TOKEN_UPPER) { | |
| 123 | while (start < end) | |
| 124 | result += *start++; | |
| 125 | } | |
| 126 | else if (other_case) | |
| 127 | result += other_case; | |
| 128 | else { | |
| 129 | while (start < end) | |
| 130 | result += cmlower(*start++); | |
| 131 | } | |
| 132 | } | |
| 133 | ||
| 134 | void token_info::upper_case(const char *start, const char *end, | |
| 135 | string &result) const | |
| 136 | { | |
| 137 | if (type != TOKEN_LOWER) { | |
| 138 | while (start < end) | |
| 139 | result += *start++; | |
| 140 | } | |
| 141 | else if (other_case) | |
| 142 | result += other_case; | |
| 143 | else { | |
| 144 | while (start < end) | |
| 145 | result += cmupper(*start++); | |
| 146 | } | |
| 147 | } | |
| 148 | ||
| 149 | token_table_entry::token_table_entry() | |
| 150 | : tok(0) | |
| 151 | { | |
| 152 | } | |
| 153 | ||
| 154 | static void store_token(const char *tok, token_type typ, | |
| 155 | const char *sk = 0, const char *oc = 0) | |
| 156 | { | |
| 157 | unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE; | |
| 158 | for (;;) { | |
| 159 | if (token_table[n].tok == 0) { | |
| 160 | if (++ntokens == TOKEN_TABLE_SIZE) | |
| 161 | assert(0); | |
| 162 | token_table[n].tok = tok; | |
| 163 | break; | |
| 164 | } | |
| 165 | if (strcmp(tok, token_table[n].tok) == 0) | |
| 166 | break; | |
| 167 | if (n == 0) | |
| 168 | n = TOKEN_TABLE_SIZE - 1; | |
| 169 | else | |
| 170 | --n; | |
| 171 | } | |
| 172 | token_table[n].ti.set(typ, sk, oc); | |
| 173 | } | |
| 174 | ||
| 175 | ||
| 176 | token_info default_token_info; | |
| 177 | ||
| 178 | const token_info *lookup_token(const char *start, const char *end) | |
| 179 | { | |
| 180 | unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE; | |
| 181 | for (;;) { | |
| 182 | if (token_table[n].tok == 0) | |
| 183 | break; | |
| 184 | if (strlen(token_table[n].tok) == size_t(end - start) | |
| 185 | && memcmp(token_table[n].tok, start, end - start) == 0) | |
| 186 | return &(token_table[n].ti); | |
| 187 | if (n == 0) | |
| 188 | n = TOKEN_TABLE_SIZE - 1; | |
| 189 | else | |
| 190 | --n; | |
| 191 | } | |
| 192 | return &default_token_info; | |
| 193 | } | |
| 194 | ||
| 195 | static void init_ascii() | |
| 196 | { | |
| 197 | const char *p; | |
| 198 | for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) { | |
| 199 | char buf[2]; | |
| 200 | buf[0] = *p; | |
| 201 | buf[1] = '\0'; | |
| 202 | store_token(strsave(buf), TOKEN_LOWER); | |
| 203 | buf[0] = cmupper(buf[0]); | |
| 204 | store_token(strsave(buf), TOKEN_UPPER); | |
| 205 | } | |
| 206 | for (p = "0123456789"; *p; p++) { | |
| 207 | char buf[2]; | |
| 208 | buf[0] = *p; | |
| 209 | buf[1] = '\0'; | |
| 210 | const char *s = strsave(buf); | |
| 211 | store_token(s, TOKEN_OTHER, s); | |
| 212 | } | |
| 213 | for (p = ".,:;?!"; *p; p++) { | |
| 214 | char buf[2]; | |
| 215 | buf[0] = *p; | |
| 216 | buf[1] = '\0'; | |
| 217 | store_token(strsave(buf), TOKEN_PUNCT); | |
| 218 | } | |
| 219 | store_token("-", TOKEN_HYPHEN); | |
| 220 | } | |
| 221 | ||
| 222 | static void store_letter(const char *lower, const char *upper, | |
| 223 | const char *sort_key = 0) | |
| 224 | { | |
| 225 | store_token(lower, TOKEN_LOWER, sort_key, upper); | |
| 226 | store_token(upper, TOKEN_UPPER, sort_key, lower); | |
| 227 | } | |
| 228 | ||
| 229 | static void init_letter(unsigned char uc_code, unsigned char lc_code, | |
| 230 | const char *sort_key) | |
| 231 | { | |
| 232 | char lbuf[2]; | |
| 233 | lbuf[0] = lc_code; | |
| 234 | lbuf[1] = 0; | |
| 235 | char ubuf[2]; | |
| 236 | ubuf[0] = uc_code; | |
| 237 | ubuf[1] = 0; | |
| 238 | store_letter(strsave(lbuf), strsave(ubuf), sort_key); | |
| 239 | } | |
| 240 | ||
| 241 | static void init_latin1() | |
| 242 | { | |
| 243 | init_letter(0xc0, 0xe0, "a"); | |
| 244 | init_letter(0xc1, 0xe1, "a"); | |
| 245 | init_letter(0xc2, 0xe2, "a"); | |
| 246 | init_letter(0xc3, 0xe3, "a"); | |
| 247 | init_letter(0xc4, 0xe4, "a"); | |
| 248 | init_letter(0xc5, 0xe5, "a"); | |
| 249 | init_letter(0xc6, 0xe6, "ae"); | |
| 250 | init_letter(0xc7, 0xe7, "c"); | |
| 251 | init_letter(0xc8, 0xe8, "e"); | |
| 252 | init_letter(0xc9, 0xe9, "e"); | |
| 253 | init_letter(0xca, 0xea, "e"); | |
| 254 | init_letter(0xcb, 0xeb, "e"); | |
| 255 | init_letter(0xcc, 0xec, "i"); | |
| 256 | init_letter(0xcd, 0xed, "i"); | |
| 257 | init_letter(0xce, 0xee, "i"); | |
| 258 | init_letter(0xcf, 0xef, "i"); | |
| 259 | ||
| 260 | init_letter(0xd0, 0xf0, "d"); | |
| 261 | init_letter(0xd1, 0xf1, "n"); | |
| 262 | init_letter(0xd2, 0xf2, "o"); | |
| 263 | init_letter(0xd3, 0xf3, "o"); | |
| 264 | init_letter(0xd4, 0xf4, "o"); | |
| 265 | init_letter(0xd5, 0xf5, "o"); | |
| 266 | init_letter(0xd6, 0xf6, "o"); | |
| 267 | init_letter(0xd8, 0xf8, "o"); | |
| 268 | init_letter(0xd9, 0xf9, "u"); | |
| 269 | init_letter(0xda, 0xfa, "u"); | |
| 270 | init_letter(0xdb, 0xfb, "u"); | |
| 271 | init_letter(0xdc, 0xfc, "u"); | |
| 272 | init_letter(0xdd, 0xfd, "y"); | |
| 273 | init_letter(0xde, 0xfe, THORN_SORT_KEY); | |
| 274 | ||
| 275 | store_token("\337", TOKEN_LOWER, "ss", "SS"); | |
| 276 | store_token("\377", TOKEN_LOWER, "y", "Y"); | |
| 277 | } | |
| 278 | ||
| 279 | static void init_two_char_letter(char l1, char l2, char u1, char u2, | |
| 280 | const char *sk = 0) | |
| 281 | { | |
| 282 | char buf[6]; | |
| 283 | buf[0] = '\\'; | |
| 284 | buf[1] = '('; | |
| 285 | buf[2] = l1; | |
| 286 | buf[3] = l2; | |
| 287 | buf[4] = '\0'; | |
| 288 | const char *p = strsave(buf); | |
| 289 | buf[2] = u1; | |
| 290 | buf[3] = u2; | |
| 291 | store_letter(p, strsave(buf), sk); | |
| 292 | buf[1] = '['; | |
| 293 | buf[4] = ']'; | |
| 294 | buf[5] = '\0'; | |
| 295 | p = strsave(buf); | |
| 296 | buf[2] = l1; | |
| 297 | buf[3] = l2; | |
| 298 | store_letter(strsave(buf), p, sk); | |
| 299 | ||
| 300 | } | |
| 301 | ||
| 302 | static void init_special_chars() | |
| 303 | { | |
| 304 | const char *p; | |
| 305 | for (p = "':^`~"; *p; p++) | |
| 306 | for (const char *q = "aeiouy"; *q; q++) { | |
| 307 | // Use a variable to work around bug in gcc 2.0 | |
| 308 | char c = cmupper(*q); | |
| 309 | init_two_char_letter(*p, *q, *p, c); | |
| 310 | } | |
| 311 | for (p = "/l/o~n,coeaeij"; *p; p += 2) { | |
| 312 | // Use variables to work around bug in gcc 2.0 | |
| 313 | char c0 = cmupper(p[0]); | |
| 314 | char c1 = cmupper(p[1]); | |
| 315 | init_two_char_letter(p[0], p[1], c0, c1); | |
| 316 | } | |
| 317 | init_two_char_letter('v', 's', 'v', 'S', "s"); | |
| 318 | init_two_char_letter('v', 'z', 'v', 'Z', "z"); | |
| 319 | init_two_char_letter('o', 'a', 'o', 'A', "a"); | |
| 320 | init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY); | |
| 321 | init_two_char_letter('-', 'd', '-', 'D'); | |
| 322 | ||
| 323 | store_token("\\(ss", TOKEN_LOWER, 0, "SS"); | |
| 324 | store_token("\\[ss]", TOKEN_LOWER, 0, "SS"); | |
| 325 | ||
| 326 | store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D"); | |
| 327 | store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]"); | |
| 328 | store_token("\\(hy", TOKEN_HYPHEN); | |
| 329 | store_token("\\[hy]", TOKEN_HYPHEN); | |
| 330 | store_token("\\(en", TOKEN_RANGE_SEP); | |
| 331 | store_token("\\[en]", TOKEN_RANGE_SEP); | |
| 332 | } | |
| 333 | ||
| 334 | static void init_strings() | |
| 335 | { | |
| 336 | char buf[6]; | |
| 337 | buf[0] = '\\'; | |
| 338 | buf[1] = '*'; | |
| 339 | for (const char *p = "'`^^,:~v_o./;"; *p; p++) { | |
| 340 | buf[2] = *p; | |
| 341 | buf[3] = '\0'; | |
| 342 | store_token(strsave(buf), TOKEN_ACCENT); | |
| 343 | buf[2] = '['; | |
| 344 | buf[3] = *p; | |
| 345 | buf[4] = ']'; | |
| 346 | buf[5] = '\0'; | |
| 347 | store_token(strsave(buf), TOKEN_ACCENT); | |
| 348 | } | |
| 349 | ||
| 350 | // -ms special letters | |
| 351 | store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY); | |
| 352 | store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY); | |
| 353 | store_letter("\\*(d-", "\\*(D-"); | |
| 354 | store_letter("\\*[d-]", "\\*[D-]"); | |
| 355 | store_letter("\\*(ae", "\\*(Ae", "ae"); | |
| 356 | store_letter("\\*[ae]", "\\*[Ae]", "ae"); | |
| 357 | store_letter("\\*(oe", "\\*(Oe", "oe"); | |
| 358 | store_letter("\\*[oe]", "\\*[Oe]", "oe"); | |
| 359 | ||
| 360 | store_token("\\*3", TOKEN_LOWER, "y", "Y"); | |
| 361 | store_token("\\*8", TOKEN_LOWER, "ss", "SS"); | |
| 362 | store_token("\\*q", TOKEN_LOWER, "o", "O"); | |
| 363 | } | |
| 364 | ||
| 365 | struct token_initer { | |
| 366 | token_initer(); | |
| 367 | }; | |
| 368 | ||
| 369 | static token_initer the_token_initer; | |
| 370 | ||
| 371 | token_initer::token_initer() | |
| 372 | { | |
| 373 | init_ascii(); | |
| 374 | init_latin1(); | |
| 375 | init_special_chars(); | |
| 376 | init_strings(); | |
| 377 | default_token_info.set(TOKEN_OTHER); | |
| 378 | } |