groff: update vendor branch to v1.20.1
[dragonfly.git] / contrib / groff / src / libs / libbib / linear.cpp
CommitLineData
92d0a6a6 1// -*- C++ -*-
4d3e9548 2/* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001, 2009
92d0a6a6
JR
3 Free Software Foundation, Inc.
4 Written by James Clark (jjc@jclark.com)
5
6This file is part of groff.
7
8groff is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
4d3e9548
JL
10Software Foundation, either version 3 of the License, or
11(at your option) any later version.
92d0a6a6
JR
12
13groff is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
4d3e9548
JL
18You should have received a copy of the GNU General Public License
19along with this program. If not, see <http://www.gnu.org/licenses/>. */
92d0a6a6
JR
20
21#include "lib.h"
22
23#include <stdlib.h>
24#include <assert.h>
25#include <errno.h>
26
27#include "posix.h"
28#include "errarg.h"
29#include "error.h"
30#include "cset.h"
31#include "cmap.h"
32#include "nonposix.h"
33
34#include "refid.h"
35#include "search.h"
36
37class file_buffer {
38 char *buffer;
39 char *bufend;
40public:
41 file_buffer();
42 ~file_buffer();
43 int load(int fd, const char *filename);
44 const char *get_start() const;
45 const char *get_end() const;
46};
47
48typedef unsigned char uchar;
49
50static uchar map[256];
51static uchar inv_map[256][3];
52
53struct map_init {
54 map_init();
55};
56
57static map_init the_map_init;
58
59map_init::map_init()
60{
61 int i;
62 for (i = 0; i < 256; i++)
63 map[i] = csalnum(i) ? cmlower(i) : '\0';
64 for (i = 0; i < 256; i++) {
65 if (cslower(i)) {
66 inv_map[i][0] = i;
67 inv_map[i][1] = cmupper(i);
68 inv_map[i][2] = '\0';
69 }
70 else if (csdigit(i)) {
71 inv_map[i][0] = i;
72 inv_map[i][1] = 0;
73 }
74 else
75 inv_map[i][0] = '\0';
76 }
77}
78
79
80class bmpattern {
81 char *pat;
82 int len;
83 int delta[256];
84public:
85 bmpattern(const char *pattern, int pattern_length);
86 ~bmpattern();
87 const char *search(const char *p, const char *end) const;
88 int length() const;
89};
90
91bmpattern::bmpattern(const char *pattern, int pattern_length)
92: len(pattern_length)
93{
94 pat = new char[len];
95 int i;
96 for (i = 0; i < len; i++)
97 pat[i] = map[uchar(pattern[i])];
98 for (i = 0; i < 256; i++)
99 delta[i] = len;
100 for (i = 0; i < len; i++)
101 for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++)
102 delta[*inv] = len - i - 1;
103}
104
105const char *bmpattern::search(const char *buf, const char *end) const
106{
107 int buflen = end - buf;
108 if (len > buflen)
109 return 0;
110 const char *strend;
111 if (buflen > len*4)
112 strend = end - len*4;
113 else
114 strend = buf;
115 const char *k = buf + len - 1;
116 const int *del = delta;
117 const char *pattern = pat;
118 for (;;) {
119 while (k < strend) {
120 int t = del[uchar(*k)];
121 if (!t)
122 break;
123 k += t;
124 k += del[uchar(*k)];
125 k += del[uchar(*k)];
126 }
127 while (k < end && del[uchar(*k)] != 0)
128 k++;
129 if (k == end)
130 break;
131 int j = len - 1;
132 const char *s = k;
133 for (;;) {
134 if (j == 0)
135 return s;
136 if (map[uchar(*--s)] != uchar(pattern[--j]))
137 break;
138 }
139 k++;
140 }
141 return 0;
142}
143
144bmpattern::~bmpattern()
145{
146 a_delete pat;
147}
148
149inline int bmpattern::length() const
150{
151 return len;
152}
153
154
155static const char *find_end(const char *bufend, const char *p);
156
157const char *linear_searcher::search_and_check(const bmpattern *key,
158 const char *buf, const char *bufend, const char **start) const
159{
160 assert(buf[-1] == '\n');
161 assert(bufend[-1] == '\n');
162 const char *ptr = buf;
163 for (;;) {
164 const char *found = key->search(ptr, bufend);
165 if (!found)
166 break;
167 if (check_match(buf, bufend, found, key->length(), &ptr, start))
168 return found;
169 }
170 return 0;
171}
172
173static const char *skip_field(const char *end, const char *p)
174{
175 for (;;)
176 if (*p++ == '\n') {
177 if (p == end || *p == '%')
178 break;
179 const char *q;
180 for (q = p; *q == ' ' || *q == '\t'; q++)
181 ;
182 if (*q == '\n')
183 break;
184 p = q + 1;
185 }
186 return p;
187}
188
189static const char *find_end(const char *bufend, const char *p)
190{
191 for (;;)
192 if (*p++ == '\n') {
193 if (p == bufend)
194 break;
195 const char *q;
196 for (q = p; *q == ' ' || *q == '\t'; q++)
197 ;
198 if (*q == '\n')
199 break;
200 p = q + 1;
201 }
202 return p;
203}
204
205
206int linear_searcher::check_match(const char *buf, const char *bufend,
207 const char *match, int matchlen,
208 const char **cont, const char **start) const
209{
210 *cont = match + 1;
211 // The user is required to supply only the first truncate_len characters
212 // of the key. If truncate_len <= 0, he must supply all the key.
213 if ((truncate_len <= 0 || matchlen < truncate_len)
214 && map[uchar(match[matchlen])] != '\0')
215 return 0;
216
217 // The character before the match must not be an alphanumeric
218 // character (unless the alphanumeric character follows one or two
219 // percent characters at the beginning of the line), nor must it be
220 // a percent character at the beginning of a line, nor a percent
221 // character following a percent character at the beginning of a
222 // line.
223
224 switch (match - buf) {
225 case 0:
226 break;
227 case 1:
228 if (match[-1] == '%' || map[uchar(match[-1])] != '\0')
229 return 0;
230 break;
231 case 2:
232 if (map[uchar(match[-1])] != '\0' && match[-2] != '%')
233 return 0;
234 if (match[-1] == '%'
235 && (match[-2] == '\n' || match[-2] == '%'))
236 return 0;
237 break;
238 default:
239 if (map[uchar(match[-1])] != '\0'
240 && !(match[-2] == '%'
241 && (match[-3] == '\n'
242 || (match[-3] == '%' && match[-4] == '\n'))))
243 return 0;
244 if (match[-1] == '%'
245 && (match[-2] == '\n'
246 || (match[-2] == '%' && match[-3] == '\n')))
247 return 0;
248 }
249
250 const char *p = match;
251 int had_percent = 0;
252 for (;;) {
253 if (*p == '\n') {
254 if (!had_percent && p[1] == '%') {
255 if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) {
256 *cont = skip_field(bufend, match + matchlen);
257 return 0;
258 }
259 if (!start)
260 break;
261 had_percent = 1;
262 }
263 if (p <= buf) {
264 if (start)
265 *start = p + 1;
266 return 1;
267 }
268 const char *q;
269 for (q = p - 1; *q == ' ' || *q == '\t'; q--)
270 ;
271 if (*q == '\n') {
272 if (start)
273 *start = p + 1;
274 break;
275 }
276 p = q;
277 }
278 p--;
279 }
280 return 1;
281}
282
283file_buffer::file_buffer()
284: buffer(0), bufend(0)
285{
286}
287
288file_buffer::~file_buffer()
289{
290 a_delete buffer;
291}
292
293const char *file_buffer::get_start() const
294{
295 return buffer ? buffer + 4 : 0;
296}
297
298const char *file_buffer::get_end() const
299{
300 return bufend;
301}
302
303int file_buffer::load(int fd, const char *filename)
304{
305 struct stat sb;
306 if (fstat(fd, &sb) < 0)
307 error("can't fstat `%1': %2", filename, strerror(errno));
308 else if (!S_ISREG(sb.st_mode))
309 error("`%1' is not a regular file", filename);
310 else {
311 // We need one character extra at the beginning for an additional newline
312 // used as a sentinel. We get 4 instead so that the read buffer will be
313 // word-aligned. This seems to make the read slightly faster. We also
314 // need one character at the end also for an additional newline used as a
315 // sentinel.
316 int size = int(sb.st_size);
317 buffer = new char[size + 4 + 1];
318 int nread = read(fd, buffer + 4, size);
319 if (nread < 0)
320 error("error reading `%1': %2", filename, strerror(errno));
321 else if (nread != size)
322 error("size of `%1' decreased", filename);
323 else {
324 char c;
325 nread = read(fd, &c, 1);
326 if (nread != 0)
327 error("size of `%1' increased", filename);
328 else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0)
329 error("database `%1' is a binary file", filename);
330 else {
331 close(fd);
332 buffer[3] = '\n';
333 int sidx = 4, didx = 4;
334 for ( ; sidx < size + 4; sidx++, didx++)
335 {
336 if (buffer[sidx] == '\r')
337 {
338 if (buffer[++sidx] != '\n')
339 buffer[didx++] = '\r';
340 else
341 size--;
342 }
343 if (sidx != didx)
344 buffer[didx] = buffer[sidx];
345 }
346 bufend = buffer + 4 + size;
347 if (bufend[-1] != '\n')
348 *bufend++ = '\n';
349 return 1;
350 }
351 }
352 a_delete buffer;
353 buffer = 0;
354 }
355 close(fd);
356 return 0;
357}
358
359linear_searcher::linear_searcher(const char *query, int query_len,
360 const char *ign, int trunc)
361: ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0)
362{
363 const char *query_end = query + query_len;
364 int nk = 0;
365 const char *p;
366 for (p = query; p < query_end; p++)
367 if (map[uchar(*p)] != '\0'
368 && (p[1] == '\0' || map[uchar(p[1])] == '\0'))
369 nk++;
370 if (nk == 0)
371 return;
372 keys = new bmpattern*[nk];
373 p = query;
374 for (;;) {
375 while (p < query_end && map[uchar(*p)] == '\0')
376 p++;
377 if (p == query_end)
378 break;
379 const char *start = p;
380 while (p < query_end && map[uchar(*p)] != '\0')
381 p++;
382 keys[nkeys++] = new bmpattern(start, p - start);
383 }
384 assert(nkeys <= nk);
385 if (nkeys == 0) {
386 a_delete keys;
387 keys = 0;
388 }
389}
390
391linear_searcher::~linear_searcher()
392{
393 for (int i = 0; i < nkeys; i++)
394 delete keys[i];
395 a_delete keys;
396}
397
398int linear_searcher::search(const char *buffer, const char *bufend,
399 const char **startp, int *lengthp) const
400{
401 assert(bufend - buffer > 0);
402 assert(buffer[-1] == '\n');
403 assert(bufend[-1] == '\n');
404 if (nkeys == 0)
405 return 0;
406 for (;;) {
407 const char *refstart;
408 const char *found = search_and_check(keys[0], buffer, bufend, &refstart);
409 if (!found)
410 break;
411 const char *refend = find_end(bufend, found + keys[0]->length());
412 int i;
413 for (i = 1; i < nkeys; i++)
414 if (!search_and_check(keys[i], refstart, refend))
415 break;
416 if (i >= nkeys) {
417 *startp = refstart;
418 *lengthp = refend - refstart;
419 return 1;
420 }
421 buffer = refend;
422 }
423 return 0;
424}
425
426class linear_search_item : public search_item {
427 file_buffer fbuf;
428public:
429 linear_search_item(const char *filename, int fid);
430 ~linear_search_item();
431 int load(int fd);
432 search_item_iterator *make_search_item_iterator(const char *);
433 friend class linear_search_item_iterator;
434};
435
436class linear_search_item_iterator : public search_item_iterator {
437 linear_search_item *lsi;
438 int pos;
439public:
440 linear_search_item_iterator(linear_search_item *, const char *query);
441 ~linear_search_item_iterator();
442 int next(const linear_searcher &, const char **ptr, int *lenp,
443 reference_id *ridp);
444};
445
446search_item *make_linear_search_item(int fd, const char *filename, int fid)
447{
448 linear_search_item *item = new linear_search_item(filename, fid);
449 if (!item->load(fd)) {
450 delete item;
451 return 0;
452 }
453 else
454 return item;
455}
456
457linear_search_item::linear_search_item(const char *filename, int fid)
458: search_item(filename, fid)
459{
460}
461
462linear_search_item::~linear_search_item()
463{
464}
465
466int linear_search_item::load(int fd)
467{
468 return fbuf.load(fd, name);
469}
470
471search_item_iterator *linear_search_item::make_search_item_iterator(
472 const char *query)
473{
474 return new linear_search_item_iterator(this, query);
475}
476
477linear_search_item_iterator::linear_search_item_iterator(
478 linear_search_item *p, const char *)
479: lsi(p), pos(0)
480{
481}
482
483linear_search_item_iterator::~linear_search_item_iterator()
484{
485}
486
487int linear_search_item_iterator::next(const linear_searcher &searcher,
488 const char **startp, int *lengthp,
489 reference_id *ridp)
490{
491 const char *bufstart = lsi->fbuf.get_start();
492 const char *bufend = lsi->fbuf.get_end();
493 const char *ptr = bufstart + pos;
494 if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) {
495 pos = *startp + *lengthp - bufstart;
496 if (ridp)
497 *ridp = reference_id(lsi->filename_id, *startp - bufstart);
498 return 1;
499 }
500 else
501 return 0;
502}