Commit | Line | Data |
---|---|---|
984263bc MD |
1 | /* |
2 | * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin. | |
3 | * Copyright (c) 1989, 1993 | |
4 | * The Regents of the University of California. All rights reserved. | |
5 | * | |
6 | * This code is derived from software contributed to Berkeley by | |
7 | * James A. Woods. | |
8 | * | |
9 | * Redistribution and use in source and binary forms, with or without | |
10 | * modification, are permitted provided that the following conditions | |
11 | * are met: | |
12 | * 1. Redistributions of source code must retain the above copyright | |
13 | * notice, this list of conditions and the following disclaimer. | |
14 | * 2. Redistributions in binary form must reproduce the above copyright | |
15 | * notice, this list of conditions and the following disclaimer in the | |
16 | * documentation and/or other materials provided with the distribution. | |
17 | * 3. All advertising materials mentioning features or use of this software | |
18 | * must display the following acknowledgement: | |
19 | * This product includes software developed by the University of | |
20 | * California, Berkeley and its contributors. | |
21 | * 4. Neither the name of the University nor the names of its contributors | |
22 | * may be used to endorse or promote products derived from this software | |
23 | * without specific prior written permission. | |
24 | * | |
25 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
26 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
27 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
28 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
29 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
30 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
31 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
32 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
33 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
34 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
35 | * SUCH DAMAGE. | |
36 | * | |
37 | * $FreeBSD: src/usr.bin/locate/code/locate.code.c,v 1.11.2.1 2001/03/04 08:46:46 kris Exp $ | |
1de703da MD |
38 | * |
39 | * @(#) Copyright (c) 1989, 1993 The Regents of the University of California. All rights reserved. | |
40 | * @(#)locate.code.c 8.1 (Berkeley) 6/6/93 | |
984263bc MD |
41 | */ |
42 | ||
984263bc MD |
43 | /* |
44 | * PURPOSE: sorted list compressor (works with a modified 'find' | |
45 | * to encode/decode a filename database) | |
46 | * | |
47 | * USAGE: bigram < list > bigrams | |
48 | * process bigrams (see updatedb) > common_bigrams | |
49 | * code common_bigrams < list > squozen_list | |
50 | * | |
51 | * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 | |
52 | * February/March 1983, p. 8). Output format is, per line, an | |
53 | * offset differential count byte followed by a partially bigram- | |
54 | * encoded ascii residue. A bigram is a two-character sequence, | |
55 | * the first 128 most common of which are encoded in one byte. | |
56 | * | |
57 | * EXAMPLE: For simple front compression with no bigram encoding, | |
58 | * if the input is... then the output is... | |
59 | * | |
60 | * /usr/src 0 /usr/src | |
61 | * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c | |
62 | * /usr/src/cmd/armadillo.c 14 armadillo.c | |
63 | * /usr/tmp/zoo 5 tmp/zoo | |
64 | * | |
65 | * The codes are: | |
66 | * | |
67 | * 0-28 likeliest differential counts + offset to make nonnegative | |
68 | * 30 switch code for out-of-range count to follow in next word | |
69 | * 31 an 8 bit char followed | |
70 | * 128-255 bigram codes (128 most common, as determined by 'updatedb') | |
71 | * 32-127 single character (printable) ascii residue (ie, literal) | |
72 | * | |
73 | * The locate database store any character except newline ('\n') | |
74 | * and NUL ('\0'). The 8-bit character support don't wast extra | |
75 | * space until you have characters in file names less than 32 | |
76 | * or greather than 127. | |
77 | * | |
78 | * | |
79 | * SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c | |
80 | * | |
81 | * AUTHOR: James A. Woods, Informatics General Corp., | |
82 | * NASA Ames Research Center, 10/82 | |
83 | * 8-bit file names characters: | |
84 | * Wolfram Schneider, Berlin September 1996 | |
85 | */ | |
86 | ||
87 | #include <sys/param.h> | |
88 | #include <err.h> | |
89 | #include <errno.h> | |
90 | #include <stdlib.h> | |
91 | #include <string.h> | |
92 | #include <stdio.h> | |
93 | #include <unistd.h> | |
94 | #include "locate.h" | |
95 | ||
96 | #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ | |
97 | ||
a4f37ab4 SW |
98 | static u_char buf1[MAXPATHLEN] = " "; |
99 | static u_char buf2[MAXPATHLEN]; | |
100 | static u_char bigrams[BGBUFSIZE + 1] = { 0 }; | |
984263bc MD |
101 | |
102 | #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ | |
103 | ||
104 | #ifdef LOOKUP | |
105 | #define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)]) | |
106 | typedef short bg_t; | |
a4f37ab4 | 107 | static bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1]; |
984263bc MD |
108 | #else |
109 | #define BGINDEX(x) bgindex(x) | |
110 | typedef int bg_t; | |
a4f37ab4 | 111 | static int bgindex(char *); |
984263bc MD |
112 | #endif /* LOOKUP */ |
113 | ||
114 | ||
6d08986d | 115 | static void usage(void) __dead2; |
984263bc MD |
116 | |
117 | int | |
89a89091 | 118 | main(int argc, char *argv[]) |
984263bc | 119 | { |
6bd457ed | 120 | u_char *cp, *oldpath, *path; |
984263bc MD |
121 | int ch, code, count, diffcount, oldcount; |
122 | FILE *fp; | |
6bd457ed | 123 | int i, j; |
984263bc MD |
124 | |
125 | while ((ch = getopt(argc, argv, "")) != -1) | |
126 | switch(ch) { | |
127 | default: | |
128 | usage(); | |
129 | } | |
130 | argc -= optind; | |
131 | argv += optind; | |
132 | ||
133 | if (argc != 1) | |
134 | usage(); | |
135 | ||
136 | if ((fp = fopen(argv[0], "r")) == NULL) | |
137 | err(1, "%s", argv[0]); | |
138 | ||
139 | /* First copy bigram array to stdout. */ | |
140 | (void)fgets(bigrams, BGBUFSIZE + 1, fp); | |
141 | ||
142 | if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) | |
143 | err(1, "stdout"); | |
144 | (void)fclose(fp); | |
145 | ||
146 | #ifdef LOOKUP | |
147 | /* init lookup table */ | |
148 | for (i = 0; i < UCHAR_MAX + 1; i++) | |
149 | for (j = 0; j < UCHAR_MAX + 1; j++) | |
150 | big[i][j] = (bg_t)-1; | |
151 | ||
152 | for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2) | |
153 | big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i; | |
154 | ||
155 | #endif /* LOOKUP */ | |
156 | ||
157 | oldpath = buf1; | |
158 | path = buf2; | |
159 | oldcount = 0; | |
160 | ||
161 | while (fgets(path, sizeof(buf2), stdin) != NULL) { | |
162 | ||
163 | /* skip empty lines */ | |
164 | if (*path == '\n') | |
165 | continue; | |
166 | ||
167 | /* remove newline */ | |
168 | for (cp = path; *cp != '\0'; cp++) { | |
169 | #ifndef LOCATE_CHAR30 | |
170 | /* old locate implementations core'd for char 30 */ | |
171 | if (*cp == SWITCH) | |
172 | *cp = '?'; | |
173 | else | |
174 | #endif /* !LOCATE_CHAR30 */ | |
175 | ||
176 | /* chop newline */ | |
177 | if (*cp == '\n') | |
178 | *cp = '\0'; | |
179 | } | |
180 | ||
181 | /* Skip longest common prefix. */ | |
182 | for (cp = path; *cp == *oldpath; cp++, oldpath++) | |
183 | if (*cp == '\0') | |
184 | break; | |
185 | ||
186 | count = cp - path; | |
187 | diffcount = count - oldcount + OFFSET; | |
188 | oldcount = count; | |
189 | if (diffcount < 0 || diffcount > 2 * OFFSET) { | |
190 | if (putchar(SWITCH) == EOF || | |
191 | putw(diffcount, stdout) == EOF) | |
192 | err(1, "stdout"); | |
193 | } else | |
194 | if (putchar(diffcount) == EOF) | |
195 | err(1, "stdout"); | |
196 | ||
197 | while (*cp != '\0') { | |
198 | /* print *two* characters */ | |
199 | ||
200 | if ((code = BGINDEX(cp)) != (bg_t)-1) { | |
201 | /* | |
202 | * print *one* as bigram | |
203 | * Found, so mark byte with | |
204 | * parity bit. | |
205 | */ | |
206 | if (putchar((code / 2) | PARITY) == EOF) | |
207 | err(1, "stdout"); | |
208 | cp += 2; | |
209 | } | |
210 | ||
211 | else { | |
212 | for (i = 0; i < 2; i++) { | |
213 | if (*cp == '\0') | |
214 | break; | |
215 | ||
216 | /* print umlauts in file names */ | |
217 | if (*cp < ASCII_MIN || | |
218 | *cp > ASCII_MAX) { | |
219 | if (putchar(UMLAUT) == EOF || | |
220 | putchar(*cp++) == EOF) | |
221 | err(1, "stdout"); | |
222 | } | |
223 | ||
224 | else { | |
225 | /* normal character */ | |
226 | if(putchar(*cp++) == EOF) | |
227 | err(1, "stdout"); | |
228 | } | |
229 | } | |
230 | ||
231 | } | |
232 | } | |
233 | ||
234 | if (path == buf1) { /* swap pointers */ | |
235 | path = buf2; | |
236 | oldpath = buf1; | |
237 | } else { | |
238 | path = buf1; | |
239 | oldpath = buf2; | |
240 | } | |
241 | } | |
242 | /* Non-zero status if there were errors */ | |
243 | if (fflush(stdout) != 0 || ferror(stdout)) | |
244 | exit(1); | |
245 | exit(0); | |
246 | } | |
247 | ||
248 | #ifndef LOOKUP | |
a4f37ab4 | 249 | static int |
984263bc MD |
250 | bgindex(bg) /* Return location of bg in bigrams or -1. */ |
251 | char *bg; | |
252 | { | |
6bd457ed | 253 | char bg0, bg1, *p; |
984263bc MD |
254 | |
255 | bg0 = bg[0]; | |
256 | bg1 = bg[1]; | |
257 | for (p = bigrams; *p != NULL; p++) | |
258 | if (*p++ == bg0 && *p == bg1) | |
259 | break; | |
260 | return (*p == NULL ? -1 : (--p - bigrams)); | |
261 | } | |
262 | #endif /* !LOOKUP */ | |
263 | ||
a4f37ab4 | 264 | static void |
89a89091 | 265 | usage(void) |
984263bc MD |
266 | { |
267 | (void)fprintf(stderr, | |
268 | "usage: locate.code common_bigrams < list > squozen_list\n"); | |
269 | exit(1); | |
270 | } |