groff: update vendor branch to v1.20.1
[dragonfly.git] / contrib / groff / src / libs / libbib / index.cpp
CommitLineData
92d0a6a6 1// -*- C++ -*-
4d3e9548 2/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2004, 2009
92d0a6a6
JR
3 Free Software Foundation, Inc.
4 Written by James Clark (jjc@jclark.com)
5
6This file is part of groff.
7
8groff is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
4d3e9548
JL
10Software Foundation, either version 3 of the License, or
11(at your option) any later version.
92d0a6a6
JR
12
13groff is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
4d3e9548
JL
18You should have received a copy of the GNU General Public License
19along with this program. If not, see <http://www.gnu.org/licenses/>. */
92d0a6a6
JR
20
21#include "lib.h"
22
23#include <stdlib.h>
24#include <errno.h>
25
26#include "posix.h"
27#include "cset.h"
28#include "cmap.h"
29#include "errarg.h"
30#include "error.h"
31
32#include "refid.h"
33#include "search.h"
34#include "index.h"
35#include "defs.h"
36
37#include "nonposix.h"
38
39// Interface to mmap.
40extern "C" {
41 void *mapread(int fd, int len);
42 int unmap(void *, int len);
43}
44
45#if 0
46const
47#endif
48int minus_one = -1;
49
50int verify_flag = 0;
51
52struct word_list;
53
54class index_search_item : public search_item {
55 search_item *out_of_date_files;
56 index_header header;
57 char *buffer;
58 void *map_addr;
59 int map_len;
60 tag *tags;
61 int *table;
62 int *lists;
63 char *pool;
64 char *key_buffer;
65 char *filename_buffer;
66 int filename_buflen;
67 char **common_words_table;
68 int common_words_table_size;
69 const char *ignore_fields;
70 time_t mtime;
71
72 const char *do_verify();
73 const int *search1(const char **pp, const char *end);
74 const int *search(const char *ptr, int length, int **temp_listp);
75 const char *munge_filename(const char *);
76 void read_common_words_file();
77 void add_out_of_date_file(int fd, const char *filename, int fid);
78public:
79 index_search_item(const char *, int);
80 ~index_search_item();
81 int load(int fd);
82 search_item_iterator *make_search_item_iterator(const char *);
83 int verify();
84 void check_files();
85 int next_filename_id() const;
86 friend class index_search_item_iterator;
87};
88
89class index_search_item_iterator : public search_item_iterator {
90 index_search_item *indx;
91 search_item_iterator *out_of_date_files_iter;
92 search_item *next_out_of_date_file;
93 const int *found_list;
94 int *temp_list;
95 char *buf;
96 int buflen;
97 linear_searcher searcher;
98 char *query;
99 int get_tag(int tagno, const linear_searcher &, const char **, int *,
100 reference_id *);
101public:
102 index_search_item_iterator(index_search_item *, const char *);
103 ~index_search_item_iterator();
104 int next(const linear_searcher &, const char **, int *, reference_id *);
105};
106
107
108index_search_item::index_search_item(const char *filename, int fid)
109: search_item(filename, fid), out_of_date_files(0), buffer(0), map_addr(0),
110 map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0),
111 common_words_table(0)
112{
113}
114
115index_search_item::~index_search_item()
116{
117 if (buffer)
118 free(buffer);
119 if (map_addr) {
120 if (unmap(map_addr, map_len) < 0)
121 error("unmap: %1", strerror(errno));
122 }
123 while (out_of_date_files) {
124 search_item *tem = out_of_date_files;
125 out_of_date_files = out_of_date_files->next;
126 delete tem;
127 }
128 a_delete filename_buffer;
129 a_delete key_buffer;
130 if (common_words_table) {
131 for (int i = 0; i < common_words_table_size; i++)
132 a_delete common_words_table[i];
133 a_delete common_words_table;
134 }
135}
136
137class file_closer {
138 int *fdp;
139public:
140 file_closer(int &fd) : fdp(&fd) { }
141 ~file_closer() { close(*fdp); }
142};
143
144// Tell the compiler that a variable is intentionally unused.
145inline void unused(void *) { }
146
147int index_search_item::load(int fd)
148{
149 file_closer fd_closer(fd); // close fd on return
150 unused(&fd_closer);
151 struct stat sb;
152 if (fstat(fd, &sb) < 0) {
153 error("can't fstat `%1': %2", name, strerror(errno));
154 return 0;
155 }
156 if (!S_ISREG(sb.st_mode)) {
157 error("`%1' is not a regular file", name);
158 return 0;
159 }
160 mtime = sb.st_mtime;
161 int size = int(sb.st_size);
162 char *addr;
163 map_addr = mapread(fd, size);
164 if (map_addr) {
165 addr = (char *)map_addr;
166 map_len = size;
167 }
168 else {
169 addr = buffer = (char *)malloc(size);
170 if (buffer == 0) {
171 error("can't allocate buffer for `%1'", name);
172 return 0;
173 }
174 char *ptr = buffer;
175 int bytes_to_read = size;
176 while (bytes_to_read > 0) {
177 int nread = read(fd, ptr, bytes_to_read);
178 if (nread == 0) {
179 error("unexpected EOF on `%1'", name);
180 return 0;
181 }
182 if (nread < 0) {
183 error("read error on `%1': %2", name, strerror(errno));
184 return 0;
185 }
186 bytes_to_read -= nread;
187 ptr += nread;
188 }
189 }
190 header = *(index_header *)addr;
191 if (header.magic != INDEX_MAGIC) {
192 error("`%1' is not an index file: wrong magic number", name);
193 return 0;
194 }
195 if (header.version != INDEX_VERSION) {
196 error("version number in `%1' is wrong: was %2, should be %3",
197 name, header.version, INDEX_VERSION);
198 return 0;
199 }
200 int sz = (header.tags_size * sizeof(tag)
201 + header.lists_size * sizeof(int)
202 + header.table_size * sizeof(int)
203 + header.strings_size
204 + sizeof(header));
205 if (sz != size) {
206 error("size of `%1' is wrong: was %2, should be %3",
207 name, size, sz);
208 return 0;
209 }
210 tags = (tag *)(addr + sizeof(header));
211 lists = (int *)(tags + header.tags_size);
212 table = (int *)(lists + header.lists_size);
213 pool = (char *)(table + header.table_size);
214 ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1;
215 key_buffer = new char[header.truncate];
216 read_common_words_file();
217 return 1;
218}
219
220const char *index_search_item::do_verify()
221{
222 if (tags == 0)
223 return "not loaded";
224 if (lists[header.lists_size - 1] >= 0)
225 return "last list element not negative";
226 int i;
227 for (i = 0; i < header.table_size; i++) {
228 int li = table[i];
229 if (li >= header.lists_size)
230 return "bad list index";
231 if (li >= 0) {
232 for (int *ptr = lists + li; *ptr >= 0; ptr++) {
233 if (*ptr >= header.tags_size)
234 return "bad tag index";
235 if (*ptr >= ptr[1] && ptr[1] >= 0)
236 return "list not ordered";
237 }
238 }
239 }
240 for (i = 0; i < header.tags_size; i++) {
241 if (tags[i].filename_index >= header.strings_size)
242 return "bad index in tags";
243 if (tags[i].length < 0)
244 return "bad length in tags";
245 if (tags[i].start < 0)
246 return "bad start in tags";
247 }
248 if (pool[header.strings_size - 1] != '\0')
249 return "last character in pool not nul";
250 return 0;
251}
252
253int index_search_item::verify()
254{
255 const char *reason = do_verify();
256 if (!reason)
257 return 1;
258 error("`%1' is bad: %2", name, reason);
259 return 0;
260}
261
262int index_search_item::next_filename_id() const
263{
264 return filename_id + header.strings_size + 1;
265}
266
267search_item_iterator *index_search_item::make_search_item_iterator(
268 const char *query)
269{
270 return new index_search_item_iterator(this, query);
271}
272
273search_item *make_index_search_item(const char *filename, int fid)
274{
275 char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)];
276 strcpy(index_filename, filename);
277 strcat(index_filename, INDEX_SUFFIX);
278 int fd = open(index_filename, O_RDONLY | O_BINARY);
279 if (fd < 0)
280 return 0;
281 index_search_item *item = new index_search_item(index_filename, fid);
282 a_delete index_filename;
283 if (!item->load(fd)) {
284 close(fd);
285 delete item;
286 return 0;
287 }
288 else if (verify_flag && !item->verify()) {
289 delete item;
290 return 0;
291 }
292 else {
293 item->check_files();
294 return item;
295 }
296}
297
298
299index_search_item_iterator::index_search_item_iterator(index_search_item *ind,
300 const char *q)
301: indx(ind), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0),
302 buf(0), buflen(0),
303 searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate),
304 query(strsave(q))
305{
306 found_list = indx->search(q, strlen(q), &temp_list);
307 if (!found_list) {
308 found_list = &minus_one;
309 warning("all keys would have been discarded in constructing index `%1'",
310 indx->name);
311 }
312}
313
314index_search_item_iterator::~index_search_item_iterator()
315{
316 a_delete temp_list;
317 a_delete buf;
318 a_delete query;
319 delete out_of_date_files_iter;
320}
321
322int index_search_item_iterator::next(const linear_searcher &,
323 const char **pp, int *lenp,
324 reference_id *ridp)
325{
326 if (found_list) {
327 for (;;) {
328 int tagno = *found_list;
329 if (tagno == -1)
330 break;
331 found_list++;
332 if (get_tag(tagno, searcher, pp, lenp, ridp))
333 return 1;
334 }
335 found_list = 0;
336 next_out_of_date_file = indx->out_of_date_files;
337 }
338 while (next_out_of_date_file) {
339 if (out_of_date_files_iter == 0)
340 out_of_date_files_iter
341 = next_out_of_date_file->make_search_item_iterator(query);
342 if (out_of_date_files_iter->next(searcher, pp, lenp, ridp))
343 return 1;
344 delete out_of_date_files_iter;
345 out_of_date_files_iter = 0;
346 next_out_of_date_file = next_out_of_date_file->next;
347 }
348 return 0;
349}
350
351int index_search_item_iterator::get_tag(int tagno,
352 const linear_searcher &searchr,
353 const char **pp, int *lenp,
354 reference_id *ridp)
355{
356 if (tagno < 0 || tagno >= indx->header.tags_size) {
357 error("bad tag number");
358 return 0;
359 }
360 tag *tp = indx->tags + tagno;
361 const char *filename = indx->munge_filename(indx->pool + tp->filename_index);
362 int fd = open(filename, O_RDONLY | O_BINARY);
363 if (fd < 0) {
364 error("can't open `%1': %2", filename, strerror(errno));
365 return 0;
366 }
367 struct stat sb;
368 if (fstat(fd, &sb) < 0) {
369 error("can't fstat: %1", strerror(errno));
370 close(fd);
371 return 0;
372 }
373 time_t mtime = sb.st_mtime;
374 if (mtime > indx->mtime) {
375 indx->add_out_of_date_file(fd, filename,
376 indx->filename_id + tp->filename_index);
377 return 0;
378 }
379 int res = 0;
380 FILE *fp = fdopen(fd, FOPEN_RB);
381 if (!fp) {
382 error("fdopen failed");
383 close(fd);
384 return 0;
385 }
386 if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0)
387 error("can't seek on `%1': %2", filename, strerror(errno));
388 else {
389 int length = tp->length;
390 int err = 0;
391 if (length == 0) {
392 if (fstat(fileno(fp), &sb) < 0) {
393 error("can't stat `%1': %2", filename, strerror(errno));
394 err = 1;
395 }
396 else if (!S_ISREG(sb.st_mode)) {
397 error("`%1' is not a regular file", filename);
398 err = 1;
399 }
400 else
401 length = int(sb.st_size);
402 }
403 if (!err) {
404 if (length + 2 > buflen) {
405 a_delete buf;
406 buflen = length + 2;
407 buf = new char[buflen];
408 }
409 if (fread(buf + 1, 1, length, fp) != (size_t)length)
410 error("fread on `%1' failed: %2", filename, strerror(errno));
411 else {
412 buf[0] = '\n';
413 // Remove the CR characters from CRLF pairs.
414 int sidx = 1, didx = 1;
415 for ( ; sidx < length + 1; sidx++, didx++)
416 {
417 if (buf[sidx] == '\r')
418 {
419 if (buf[++sidx] != '\n')
420 buf[didx++] = '\r';
421 else
422 length--;
423 }
424 if (sidx != didx)
425 buf[didx] = buf[sidx];
426 }
427 buf[length + 1] = '\n';
428 res = searchr.search(buf + 1, buf + 2 + length, pp, lenp);
429 if (res && ridp)
430 *ridp = reference_id(indx->filename_id + tp->filename_index,
431 tp->start);
432 }
433 }
434 }
435 fclose(fp);
436 return res;
437}
438
439const char *index_search_item::munge_filename(const char *filename)
440{
441 if (IS_ABSOLUTE(filename))
442 return filename;
443 const char *cwd = pool;
444 int need_slash = (cwd[0] != 0
445 && strchr(DIR_SEPS, strchr(cwd, '\0')[-1]) == 0);
446 int len = strlen(cwd) + strlen(filename) + need_slash + 1;
447 if (len > filename_buflen) {
448 a_delete filename_buffer;
449 filename_buflen = len;
450 filename_buffer = new char[len];
451 }
452 strcpy(filename_buffer, cwd);
453 if (need_slash)
454 strcat(filename_buffer, "/");
455 strcat(filename_buffer, filename);
456 return filename_buffer;
457}
458
459const int *index_search_item::search1(const char **pp, const char *end)
460{
461 while (*pp < end && !csalnum(**pp))
462 *pp += 1;
463 if (*pp >= end)
464 return 0;
465 const char *start = *pp;
466 while (*pp < end && csalnum(**pp))
467 *pp += 1;
468 int len = *pp - start;
469 if (len < header.shortest)
470 return 0;
471 if (len > header.truncate)
472 len = header.truncate;
473 int is_number = 1;
474 for (int i = 0; i < len; i++)
475 if (csdigit(start[i]))
476 key_buffer[i] = start[i];
477 else {
478 key_buffer[i] = cmlower(start[i]);
479 is_number = 0;
480 }
481 if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9'))
482 return 0;
483 unsigned hc = hash(key_buffer, len);
484 if (common_words_table) {
485 for (int h = hc % common_words_table_size;
486 common_words_table[h];
487 --h) {
488 if (strlen(common_words_table[h]) == (size_t)len
489 && memcmp(common_words_table[h], key_buffer, len) == 0)
490 return 0;
491 if (h == 0)
492 h = common_words_table_size;
493 }
494 }
495 int li = table[int(hc % header.table_size)];
496 return li < 0 ? &minus_one : lists + li;
497}
498
499static void merge(int *result, const int *s1, const int *s2)
500{
501 for (; *s1 >= 0; s1++) {
502 while (*s2 >= 0 && *s2 < *s1)
503 s2++;
504 if (*s2 == *s1)
505 *result++ = *s2;
506 }
507 *result++ = -1;
508}
509
510const int *index_search_item::search(const char *ptr, int length,
511 int **temp_listp)
512{
513 const char *end = ptr + length;
514 if (*temp_listp) {
515 a_delete *temp_listp;
516 *temp_listp = 0;
517 }
518 const int *first_list = 0;
519 while (ptr < end && (first_list = search1(&ptr, end)) == 0)
520 ;
521 if (!first_list)
522 return 0;
523 if (*first_list < 0)
524 return first_list;
525 const int *second_list = 0;
526 while (ptr < end && (second_list = search1(&ptr, end)) == 0)
527 ;
528 if (!second_list)
529 return first_list;
530 if (*second_list < 0)
531 return second_list;
532 const int *p;
533 for (p = first_list; *p >= 0; p++)
534 ;
535 int len = p - first_list;
536 for (p = second_list; *p >= 0; p++)
537 ;
538 if (p - second_list < len)
539 len = p - second_list;
540 int *matches = new int[len + 1];
541 merge(matches, first_list, second_list);
542 while (ptr < end) {
543 const int *list = search1(&ptr, end);
544 if (list != 0) {
545 if (*list < 0) {
546 a_delete matches;
547 return list;
548 }
549 merge(matches, matches, list);
550 if (*matches < 0) {
551 a_delete matches;
552 return &minus_one;
553 }
554 }
555 }
556 *temp_listp = matches;
557 return matches;
558}
559
560void index_search_item::read_common_words_file()
561{
562 if (header.common <= 0)
563 return;
564 const char *common_words_file = munge_filename(strchr(pool, '\0') + 1);
565 errno = 0;
566 FILE *fp = fopen(common_words_file, "r");
567 if (!fp) {
568 error("can't open `%1': %2", common_words_file, strerror(errno));
569 return;
570 }
571 common_words_table_size = 2*header.common + 1;
572 while (!is_prime(common_words_table_size))
573 common_words_table_size++;
574 common_words_table = new char *[common_words_table_size];
575 for (int i = 0; i < common_words_table_size; i++)
576 common_words_table[i] = 0;
577 int count = 0;
578 int key_len = 0;
579 for (;;) {
580 int c = getc(fp);
581 while (c != EOF && !csalnum(c))
582 c = getc(fp);
583 if (c == EOF)
584 break;
585 do {
586 if (key_len < header.truncate)
587 key_buffer[key_len++] = cmlower(c);
588 c = getc(fp);
589 } while (c != EOF && csalnum(c));
590 if (key_len >= header.shortest) {
591 int h = hash(key_buffer, key_len) % common_words_table_size;
592 while (common_words_table[h]) {
593 if (h == 0)
594 h = common_words_table_size;
595 --h;
596 }
597 common_words_table[h] = new char[key_len + 1];
598 memcpy(common_words_table[h], key_buffer, key_len);
599 common_words_table[h][key_len] = '\0';
600 }
601 if (++count >= header.common)
602 break;
603 key_len = 0;
604 if (c == EOF)
605 break;
606 }
607 fclose(fp);
608}
609
610void index_search_item::add_out_of_date_file(int fd, const char *filename,
611 int fid)
612{
613 search_item **pp;
614 for (pp = &out_of_date_files; *pp; pp = &(*pp)->next)
615 if ((*pp)->is_named(filename))
616 return;
617 *pp = make_linear_search_item(fd, filename, fid);
618 warning("`%1' modified since `%2' created", filename, name);
619}
620
621void index_search_item::check_files()
622{
623 const char *pool_end = pool + header.strings_size;
624 for (const char *ptr = strchr(ignore_fields, '\0') + 1;
625 ptr < pool_end;
626 ptr = strchr(ptr, '\0') + 1) {
627 const char *path = munge_filename(ptr);
628 struct stat sb;
629 if (stat(path, &sb) < 0)
630 error("can't stat `%1': %2", path, strerror(errno));
631 else if (sb.st_mtime > mtime) {
632 int fd = open(path, O_RDONLY | O_BINARY);
633 if (fd < 0)
634 error("can't open `%1': %2", path, strerror(errno));
635 else
636 add_out_of_date_file(fd, path, filename_id + (ptr - pool));
637 }
638 }
639}