2 /* Copyright (C) 1989-1992, 2000, 2001, 2002, 2003, 2004, 2009
3 Free Software Foundation, Inc.
4 Written by James Clark (jjc@jclark.com)
6 This file is part of groff.
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
30 #include "stringclass.h"
// Version text printed by the version option in main().
39 extern "C" const char *Version_string;
// Initial value of hash_table_size.
41 #define DEFAULT_HASH_TABLE_SIZE 997
// Template passed to mkstemp() (via temp_index_file) for the temporary index.
42 #define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
44 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
46 #define MALLOC_OVERHEAD 16
// Number of ints that fit in 1024 bytes once MALLOC_OVERHEAD, one block
// pointer and one int have been subtracted.  store_key() treats this as
// the capacity of a block's v[] array.
52 const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
53 - sizeof(int)) / sizeof(int));
// Construct an empty block (used == 0) whose next pointer is p; with the
// default argument the block starts a new chain.
59 block(block *p = 0) : next(p), used(0) { }
// Constructor declaration.  Callers pass the word's bytes, its length and
// the list node that should follow the new one.
73 word_list(const char *, int, word_list *);
// Key hash table; allocated by init_hash_table() with hash_table_size
// entries.
76 table_entry *hash_table;
// Number of buckets; changed by -h, which main() rounds up to a prime.
77 int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
78 // We make this the same size as hash_table so we only have to do one
// hash computation per key: store_key() indexes both tables with the
// same value of h.
80 static word_list **common_words_table = 0;
// Name of the temporary index file created with mkstemp(); also unlinked
// by fatal_error_exit().
86 char *temp_index_file = 0;
// Field letters tested with strchr() in do_file(); main() can replace the
// default from the command line.
88 const char *ignore_fields = "XYZ";
// File read by read_common_words_file().
89 const char *common_words_file = COMMON_WORDS_FILE;
// Upper bound on the number of common words loaded (-n); reset to the
// actual count by read_common_words_file().
90 int n_ignore_words = 100;
// Upper bound on keys stored per reference (-k).
93 int max_keys_per_item = 100;
// Forward declarations of the file-local helpers used by main() and by
// each other.
95 static void usage(FILE *stream);
96 static void write_hash_table();
97 static void init_hash_table();
98 static void read_common_words_file();
99 static int store_key(char *s, int len);
100 static void possibly_store_key(char *s, int len);
101 static int do_whole_file(const char *filename);
102 static int do_file(const char *filename);
103 static void store_reference(int filename_index, int pos, int len);
104 static void check_integer_arg(char opt, const char *arg, int min, int *res);
105 static void store_filename(const char *);
106 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
107 static char *get_cwd();
// Signal helpers with external linkage, called from main().
// NOTE(review): presumably defined in another translation unit -- confirm.
111 void catch_fatal_signals();
112 void ignore_fatal_signals();
// Program entry point.  Parses the command line with getopt_long(), writes
// the index into a temporary file created with mkstemp(), and finally
// installs that file under the name base_name + INDEX_SUFFIX.
115 int main(int argc, char **argv)
117 program_name = argv[0];
// Give stderr a static buffer.
118 static char stderr_buf[BUFSIZ];
119 setbuf(stderr, stderr_buf);
121 const char *base_name = 0;
// Every input file is handed to `parser`: do_file by default, switched to
// do_whole_file by the option handling below.
122 typedef int (*parser_t)(const char *);
123 parser_t parser = do_file;
124 const char *directory = 0;
125 const char *foption = 0;
127 static const struct option long_options[] = {
128 { "help", no_argument, 0, CHAR_MAX + 1 },
129 { "version", no_argument, 0, 'v' },
// NOTE(review): the option string lists "c:" twice; the second occurrence
// looks redundant.
132 while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
137 common_words_file = optarg;
// -h: a table size that is not prime is bumped to the next prime and a
// warning is issued.
146 check_integer_arg('h', optarg, 1, &hash_table_size);
147 if (!is_prime(hash_table_size)) {
148 while (!is_prime(++hash_table_size))
150 warning("%1 not prime: using %2 instead", optarg, hash_table_size);
154 ignore_fields = optarg;
157 check_integer_arg('k', optarg, 1, &max_keys_per_item);
160 check_integer_arg('l', optarg, 0, &shortest_len);
163 check_integer_arg('n', optarg, 0, &n_ignore_words);
169 check_integer_arg('t', optarg, 1, &truncate_len);
172 parser = do_whole_file;
175 printf("GNU indxbib (groff) version %s\n", Version_string);
178 case CHAR_MAX + 1: // --help
// Some input is required: file operands or a file list (foption).
190 if (optind >= argc && foption == 0)
191 fatal("no files and no -f option");
// Strings handed to store_filename() before any input file is processed:
// the working directory, `directory`, the common words file name and the
// ignored field letters.
193 char *path = get_cwd();
194 store_filename(path);
198 store_filename(directory);
200 store_filename(common_words_file);
201 store_filename(ignore_fields);
// key_buffer holds at most truncate_len bytes of the key being collected.
202 key_buffer = new char[truncate_len];
203 read_common_words_file();
// The index base name is the first file operand, or DEFAULT_INDEX_NAME
// when there are no operands.
205 base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
// p: last directory separator in base_name, considering each separator
// character in DIR_SEPS and keeping the right-most match.
206 const char *p = strrchr(base_name, DIR_SEPS[0]), *p1;
207 const char *sep = &DIR_SEPS[1];
209 p1 = strrchr(base_name, *sep);
210 if (p1 && (!p || p1 > p))
// name_max is queried for the directory part of base_name, or for "."
// otherwise.
216 char *dir = strsave(base_name);
217 dir[p - base_name] = '\0';
218 name_max = file_name_max(dir);
222 name_max = file_name_max(".");
223 const char *filename = p ? p + 1 : base_name;
// Refuse a final component that exceeds name_max once INDEX_SUFFIX is
// appended (sizeof includes the terminating NUL, hence the - 1).
224 if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
225 fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
// Temporary file name: the leading (p - base_name) bytes of base_name
// followed by the template, or the bare template.
228 temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)];
229 memcpy(temp_index_file, base_name, p - base_name);
230 strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE);
233 temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
// Signal handling is set up before mkstemp() creates the file.
235 catch_fatal_signals();
236 int fd = mkstemp(temp_index_file);
238 fatal("can't create temporary index file: %1", strerror(errno));
239 indxfp = fdopen(fd, FOPEN_WB);
241 fatal("fdopen failed");
// Leave room for the header; write_hash_table() seeks back to offset 0 and
// writes it last.
242 if (fseek(indxfp, sizeof(index_header), 0) < 0)
243 fatal("can't seek past index header: %1", strerror(errno));
// A file list named "-" is not opened with fopen().
// NOTE(review): presumably read from standard input instead -- confirm.
247 if (strcmp(foption, "-") != 0) {
249 fp = fopen(foption, "r");
251 fatal("can't open `%1': %2", foption, strerror(errno));
// One pathname per line of the list.
257 for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
259 error_with_file_and_line(foption, lineno,
260 "nul character in pathname ignored");
// Empty lines are not passed to the parser.
264 if (path.length() > 0) {
266 if (!(*parser)(path.contents()))
// Remaining command-line operands are parsed as input files.
277 for (int i = optind; i < argc; i++)
278 if (!(*parser)(argv[i]))
281 if (fclose(indxfp) < 0)
282 fatal("error closing temporary index file: %1", strerror(errno));
// Final index name: base_name followed by INDEX_SUFFIX.
283 char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)];
284 strcpy(index_file, base_name);
285 strcat(index_file, INDEX_SUFFIX);
288 if (access(index_file, R_OK) == 0)
291 if (rename(temp_index_file, index_file) < 0) {
293 // RENAME could fail on plain MSDOS filesystems because
294 // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
295 char *fname = p ? index_file + (p - base_name) : 0;
298 // Replace the dot with an underscore and try again.
300 && (dot = strchr(fname, '.')) != 0
301 && strcmp(dot, INDEX_SUFFIX) != 0)
303 if (rename(temp_index_file, index_file) < 0)
305 fatal("can't rename temporary index file: %1", strerror(errno));
307 #else /* not HAVE_RENAME */
// Without rename(): remove any old index, link the temporary file to the
// final name, then remove the temporary name.
308 ignore_fatal_signals();
309 if (unlink(index_file) < 0) {
311 fatal("can't unlink `%1': %2", index_file, strerror(errno));
313 if (link(temp_index_file, index_file) < 0)
314 fatal("can't link temporary index file: %1", strerror(errno));
315 if (unlink(temp_index_file) < 0)
316 fatal("can't unlink temporary index file: %1", strerror(errno));
317 #endif /* not HAVE_RENAME */
// Command-line synopsis.  The %s in the format is a placeholder for the
// program name.
// NOTE(review): presumably printed to `stream` -- confirm.
322 static void usage(FILE *stream)
325 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
326 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
// Parse `arg` as a base-10 integer argument of option -`opt`.
//   opt - option letter, used only in diagnostics
//   arg - option argument text
//   min - smallest acceptable value
//   res - destination for the parsed value
// Reports through error() when arg has no digits, is below min, is larger
// than the maximum int, or is followed by trailing characters.
// NOTE(review): presumably stores the value through res on success --
// confirm.
330 static void check_integer_arg(char opt, const char *arg, int min, int *res)
333 long n = strtol(arg, &ptr, 10);
// strtol() consumed nothing: not a number at all.
334 if (n == 0 && ptr == arg)
335 error("argument to -%1 not an integer", opt);
337 error("argument to -%1 must not be less than %2", opt, min);
340 error("argument to -%1 greater than maximum integer", opt);
// Digits were parsed but something follows them.
341 else if (*ptr != '\0')
342 error("junk after integer argument to -%1", opt);
// Obtain the current working directory in a buffer allocated with new[].
// A getcwd() failure is fatal, as is a directory name that would need a
// buffer larger than INT_MAX.
// NOTE(review): presumably retries with a larger `size` when the buffer is
// too small and returns buf on success -- confirm.
347 static char *get_cwd()
353 buf = new char[size];
354 if (getcwd(buf, size))
357 fatal("cannot get current working directory: %1", strerror(errno));
360 fatal("current working directory longer than INT_MAX");
// Guards the buffer size against exceeding INT_MAX.
361 if (size > INT_MAX/2)
// Construct a list node for the n-byte word s, followed by node p.
// read_common_words_file() passes the current bucket head as p, so each new
// node is pushed on the front of its chain.
369 word_list::word_list(const char *s, int n, word_list *p)
// Load words from common_words_file into common_words_table, a chained
// hash table with hash_table_size buckets.  Words are lower-cased with
// cmlower() and limited to truncate_len bytes.  Failure to open the file is
// fatal.
376 static void read_common_words_file()
// Nothing to load when the limit is not positive.
// NOTE(review): presumably returns here, leaving common_words_table null --
// confirm.
378 if (n_ignore_words <= 0)
381 FILE *fp = fopen(common_words_file, "r");
383 fatal("can't open `%1': %2", common_words_file, strerror(errno));
384 common_words_table = new word_list * [hash_table_size];
385 for (int i = 0; i < hash_table_size; i++)
386 common_words_table[i] = 0;
// Skip characters that are not alphanumeric.
391 while (c != EOF && !csalnum(c))
// Collect one alphanumeric run; bytes beyond truncate_len are not stored.
396 if (key_len < truncate_len)
397 key_buffer[key_len++] = cmlower(c);
399 } while (c != EOF && csalnum(c));
// Words shorter than shortest_len are not entered in the table.
400 if (key_len >= shortest_len) {
401 int h = hash(key_buffer, key_len) % hash_table_size;
402 common_words_table[h] = new word_list(key_buffer, key_len,
403 common_words_table[h]);
// Honour the limit on the number of words.
405 if (++count >= n_ignore_words)
// Record the count that was actually reached.
411 n_ignore_words = count;
// Index `filename` as a single reference: keys are entered with
// store_key() and counted against max_keys_per_item, and one reference
// with position 0 and length 0 is recorded for the file.  A failure to open
// the file is reported with error().
415 static int do_whole_file(const char *filename)
418 FILE *fp = fopen(filename, "r");
420 error("can't open `%1': %2", filename, strerror(errno));
426 while ((c = getc(fp)) != EOF) {
430 while ((c = getc(fp)) != EOF) {
// Key bytes beyond truncate_len are not stored.
433 if (key_len < truncate_len)
434 key_buffer[key_len++] = c;
// Only keys that store_key() reports as stored count towards the limit.
436 if (store_key(key_buffer, key_len)) {
437 if (++count >= max_keys_per_item)
// filenames.length() is taken before store_filename() is called, and is
// passed as the reference's filename index.
444 store_reference(filenames.length(), 0, 0);
445 store_filename(filename);
// Scan the bibliographic database `filename` with the state machine
// declared below.  Keys are handed to possibly_store_key(), and each
// reference is recorded with store_reference() as (filename_index, start
// position, length in bytes).  A failure to open the file is reported with
// error().
450 static int do_file(const char *filename)
453 // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
454 // byte counts to be consistent with fseek.
455 FILE *fp = fopen(filename, FOPEN_RB);
457 error("can't open `%1': %2", filename, strerror(errno));
// filenames.length() is taken before store_filename() is called, and is
// used as this file's index in every reference record.
460 int filename_index = filenames.length();
461 store_filename(filename);
464 START, // at the start of the file; also in between references
465 BOL, // in the middle of a reference, at the beginning of the line
466 PERCENT, // seen a percent at the beginning of the line
467 IGNORE, // ignoring a field
468 IGNORE_BOL, // at the beginning of a line ignoring a field
469 KEY, // in the middle of a key
470 DISCARD, // after truncate_len bytes of a key
471 MIDDLE // in between keys
474 // In states START, BOL, IGNORE_BOL, space_count counts how many spaces at
475 // the beginning have been seen. In states PERCENT, IGNORE, KEY,
476 // MIDDLE space_count must be 0.
478 int byte_count = 0; // bytes read
480 int ref_start = -1; // position of start of current reference
485 // We opened the file in binary mode, so we need to skip
486 // every CR character before a Newline.
496 #if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
497 else if (c == 0x1a) // ^Z means EOF in text files
503 if (c == ' ' || c == '\t') {
// byte_count already includes the current byte, hence the - 1; leading
// blanks counted in space_count are treated as part of the reference.
511 ref_start = byte_count - space_count - 1;
515 else if (csalnum(c)) {
526 if (space_count > 0) {
// Length excludes the current byte and any blanks counted in space_count.
538 store_reference(filename_index, ref_start,
539 byte_count - 1 - space_count - ref_start);
// Tests whether c is one of the field letters in ignore_fields.
555 if (strchr(ignore_fields, c) != 0)
569 if (space_count > 0) {
581 store_reference(filename_index, ref_start,
582 byte_count - 1 - space_count - ref_start);
// Key bytes beyond truncate_len are not stored.
593 if (key_len < truncate_len)
594 key_buffer[key_len++] = c;
599 possibly_store_key(key_buffer, key_len);
609 possibly_store_key(key_buffer, key_len);
635 possibly_store_key(key_buffer, key_len);
// Unlike the calls above, this length has no - 1 correction.
// NOTE(review): presumably the end-of-file case, where no extra byte has
// been counted -- confirm.
642 store_reference(filename_index, ref_start,
643 byte_count - ref_start - space_count);
// Write one reference record to indxfp with fwrite_or_die().
//   filename_index - value stored in the record's filename_index field
//   pos, len       - start position and length of the reference
// NOTE(review): pos and len are presumably stored in the same record --
// confirm.
652 static void store_reference(int filename_index, int pos, int len)
655 t.filename_index = filename_index;
658 fwrite_or_die(&t, sizeof(t), 1, indxfp);
// Record the string fn.
// NOTE(review): presumably appends fn to `filenames`; callers read
// filenames.length() immediately before calling this and use it as the
// entry's index -- confirm.
662 static void store_filename(const char *fn)
// Allocate hash_table with hash_table_size entries and clear every entry's
// block pointer, so that each bucket starts out empty.
668 static void init_hash_table()
670 hash_table = new table_entry[hash_table_size];
671 for (int i = 0; i < hash_table_size; i++)
672 hash_table[i].ptr = 0;
// Store key s (len bytes) for the current reference, subject to the
// max_keys_per_item limit.  The static counters persist between calls.
675 static void possibly_store_key(char *s, int len)
677 static int last_tagno = -1;
678 static int key_count;
// A change in ntags means a new reference has started.
// NOTE(review): presumably resets key_count and updates last_tagno here --
// confirm.
679 if (last_tagno != ntags) {
683 if (key_count < max_keys_per_item) {
684 if (store_key(s, len))
// Enter key s (len bytes) into hash_table for the reference numbered
// ntags.  The key is lower-cased in place.  Callers treat a nonzero return
// value as "a key was stored".
689 static int store_key(char *s, int len)
// Keys shorter than shortest_len take this branch.
691 if (len < shortest_len)
// Lower-case every non-digit character.
694 for (int i = 0; i < len; i++)
695 if (!csdigit(s[i])) {
697 s[i] = cmlower(s[i]);
// All-digit keys take this branch unless they are four characters long and
// start with "19".
// NOTE(review): four-digit keys starting with "20" get no such exemption.
699 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
// The same h indexes both common_words_table and hash_table.
701 int h = hash(s, len) % hash_table_size;
702 if (common_words_table) {
// Look for an exact match in the common-words chain for this bucket.
703 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
704 if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
707 table_entry *pp = hash_table + h;
// The most recent value in the head block is already ntags: this bucket
// has been hit before for the current reference.
710 else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
// Head block is full: chain a fresh block in front of it.
712 else if (pp->ptr->used >= BLOCK_SIZE)
713 pp->ptr = new block(pp->ptr);
714 pp->ptr->v[(pp->ptr->used)++] = ntags;
// Write the remainder of the index to indxfp: the per-bucket lists, the
// table of per-bucket count values, the filename strings, and finally the
// header at offset 0 (main() reserved space for it before any data was
// written).
718 static void write_hash_table()
// List terminator written after a bucket's values.
720 const int minus_one = -1;
722 for (int i = 0; i < hash_table_size; i++) {
723 block *ptr = hash_table[i].ptr;
// Each entry's count field is set either to -1 or to li.
725 hash_table[i].count = -1;
727 hash_table[i].count = li;
// Write the used portion of one block's v[] array.
736 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
742 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
// If a table_entry is exactly one int, the whole table can be written with
// a single call.
746 if (sizeof(table_entry) == sizeof(int))
747 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
749 // write it out word by word
750 for (int i = 0; i < hash_table_size; i++)
751 fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
753 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
// Go back to the start of the file to fill in the header.
754 if (fseek(indxfp, 0, 0) < 0)
755 fatal("error seeking on index file: %1", strerror(errno));
757 h.magic = INDEX_MAGIC;
758 h.version = INDEX_VERSION;
761 h.table_size = hash_table_size;
762 h.strings_size = filenames.length();
763 h.truncate = truncate_len;
764 h.shortest = shortest_len;
765 h.common = n_ignore_words;
766 fwrite_or_die(&h, sizeof(h), 1, indxfp);
// fwrite() wrapper: write nitems items of `size` bytes from ptr to fp and
// call fatal() with the errno text if fewer than nitems were written.
769 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
771 if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
772 fatal("fwrite failed: %1", strerror(errno));
775 void fatal_error_exit()
786 unlink(temp_index_file);