| Commit | Line | Data |
|---|---|---|
| 984263bc MD |
1 | /* |
| 2 | * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin. | |
| 3 | * Copyright (c) 1989, 1993 | |
| 4 | * The Regents of the University of California. All rights reserved. | |
| 5 | * | |
| 6 | * This code is derived from software contributed to Berkeley by | |
| 7 | * James A. Woods. | |
| 8 | * | |
| 9 | * Redistribution and use in source and binary forms, with or without | |
| 10 | * modification, are permitted provided that the following conditions | |
| 11 | * are met: | |
| 12 | * 1. Redistributions of source code must retain the above copyright | |
| 13 | * notice, this list of conditions and the following disclaimer. | |
| 14 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 15 | * notice, this list of conditions and the following disclaimer in the | |
| 16 | * documentation and/or other materials provided with the distribution. | |
| 17 | * 3. All advertising materials mentioning features or use of this software | |
| 18 | * must display the following acknowledgement: | |
| 19 | * This product includes software developed by the University of | |
| 20 | * California, Berkeley and its contributors. | |
| 21 | * 4. Neither the name of the University nor the names of its contributors | |
| 22 | * may be used to endorse or promote products derived from this software | |
| 23 | * without specific prior written permission. | |
| 24 | * | |
| 25 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
| 26 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 27 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 28 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
| 29 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 30 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 31 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 32 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 33 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 34 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 35 | * SUCH DAMAGE. | |
| 36 | * | |
| 37 | * $FreeBSD: src/usr.bin/locate/code/locate.code.c,v 1.11.2.1 2001/03/04 08:46:46 kris Exp $ | |
| 1de703da MD |
38 | * |
| 39 | * @(#) Copyright (c) 1989, 1993 The Regents of the University of California. All rights reserved. | |
| 40 | * @(#)locate.code.c 8.1 (Berkeley) 6/6/93 | |
| 984263bc MD |
41 | */ |
| 42 | ||
| 984263bc MD |
43 | /* |
| 44 | * PURPOSE: sorted list compressor (works with a modified 'find' | |
| 45 | * to encode/decode a filename database) | |
| 46 | * | |
| 47 | * USAGE: bigram < list > bigrams | |
| 48 | * process bigrams (see updatedb) > common_bigrams | |
| 49 | * code common_bigrams < list > squozen_list | |
| 50 | * | |
| 51 | * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 | |
| 52 | * February/March 1983, p. 8). Output format is, per line, an | |
| 53 | * offset differential count byte followed by a partially bigram- | |
| 54 | * encoded ascii residue. A bigram is a two-character sequence, | |
| 55 | * the first 128 most common of which are encoded in one byte. | |
| 56 | * | |
| 57 | * EXAMPLE: For simple front compression with no bigram encoding, | |
| 58 | * if the input is... then the output is... | |
| 59 | * | |
| 60 | * /usr/src 0 /usr/src | |
| 61 | * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c | |
| 62 | * /usr/src/cmd/armadillo.c 14 armadillo.c | |
| 63 | * /usr/tmp/zoo 5 tmp/zoo | |
| 64 | * | |
| 65 | * The codes are: | |
| 66 | * | |
| 67 | * 0-28 likeliest differential counts + offset to make nonnegative | |
| 68 | * 30 switch code for out-of-range count to follow in next word | |
| 69 | * 31 an 8 bit char followed | |
| 70 | * 128-255 bigram codes (128 most common, as determined by 'updatedb') | |
| 71 | * 32-127 single character (printable) ascii residue (ie, literal) | |
| 72 | * | |
| 73 | * The locate database store any character except newline ('\n') | |
| 74 | * and NUL ('\0'). The 8-bit character support don't wast extra | |
| 75 | * space until you have characters in file names less than 32 | |
| 76 | * or greather than 127. | |
| 77 | * | |
| 78 | * | |
| 79 | * SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c | |
| 80 | * | |
| 81 | * AUTHOR: James A. Woods, Informatics General Corp., | |
| 82 | * NASA Ames Research Center, 10/82 | |
| 83 | * 8-bit file names characters: | |
| 84 | * Wolfram Schneider, Berlin September 1996 | |
| 85 | */ | |
| 86 | ||
| 87 | #include <sys/param.h> | |
| 88 | #include <err.h> | |
| 89 | #include <errno.h> | |
| 90 | #include <stdlib.h> | |
| 91 | #include <string.h> | |
| 92 | #include <stdio.h> | |
| 93 | #include <unistd.h> | |
| 94 | #include "locate.h" | |
| 95 | ||
| 96 | #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ | |
| 97 | ||
| 98 | u_char buf1[MAXPATHLEN] = " "; | |
| 99 | u_char buf2[MAXPATHLEN]; | |
| 100 | u_char bigrams[BGBUFSIZE + 1] = { 0 }; | |
| 101 | ||
| 102 | #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ | |
| 103 | ||
| 104 | #ifdef LOOKUP | |
| 105 | #define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)]) | |
| 106 | typedef short bg_t; | |
| 107 | bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1]; | |
| 108 | #else | |
| 109 | #define BGINDEX(x) bgindex(x) | |
| 110 | typedef int bg_t; | |
| 2d8a3be7 | 111 | int bgindex(char *); |
| 984263bc MD |
112 | #endif /* LOOKUP */ |
| 113 | ||
| 114 | ||
| 2d8a3be7 | 115 | void usage(void); |
| 984263bc MD |
116 | |
| 117 | int | |
| 89a89091 | 118 | main(int argc, char *argv[]) |
| 984263bc | 119 | { |
| 6bd457ed | 120 | u_char *cp, *oldpath, *path; |
| 984263bc MD |
121 | int ch, code, count, diffcount, oldcount; |
| 122 | FILE *fp; | |
| 6bd457ed | 123 | int i, j; |
| 984263bc MD |
124 | |
| 125 | while ((ch = getopt(argc, argv, "")) != -1) | |
| 126 | switch(ch) { | |
| 127 | default: | |
| 128 | usage(); | |
| 129 | } | |
| 130 | argc -= optind; | |
| 131 | argv += optind; | |
| 132 | ||
| 133 | if (argc != 1) | |
| 134 | usage(); | |
| 135 | ||
| 136 | if ((fp = fopen(argv[0], "r")) == NULL) | |
| 137 | err(1, "%s", argv[0]); | |
| 138 | ||
| 139 | /* First copy bigram array to stdout. */ | |
| 140 | (void)fgets(bigrams, BGBUFSIZE + 1, fp); | |
| 141 | ||
| 142 | if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) | |
| 143 | err(1, "stdout"); | |
| 144 | (void)fclose(fp); | |
| 145 | ||
| 146 | #ifdef LOOKUP | |
| 147 | /* init lookup table */ | |
| 148 | for (i = 0; i < UCHAR_MAX + 1; i++) | |
| 149 | for (j = 0; j < UCHAR_MAX + 1; j++) | |
| 150 | big[i][j] = (bg_t)-1; | |
| 151 | ||
| 152 | for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2) | |
| 153 | big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i; | |
| 154 | ||
| 155 | #endif /* LOOKUP */ | |
| 156 | ||
| 157 | oldpath = buf1; | |
| 158 | path = buf2; | |
| 159 | oldcount = 0; | |
| 160 | ||
| 161 | while (fgets(path, sizeof(buf2), stdin) != NULL) { | |
| 162 | ||
| 163 | /* skip empty lines */ | |
| 164 | if (*path == '\n') | |
| 165 | continue; | |
| 166 | ||
| 167 | /* remove newline */ | |
| 168 | for (cp = path; *cp != '\0'; cp++) { | |
| 169 | #ifndef LOCATE_CHAR30 | |
| 170 | /* old locate implementations core'd for char 30 */ | |
| 171 | if (*cp == SWITCH) | |
| 172 | *cp = '?'; | |
| 173 | else | |
| 174 | #endif /* !LOCATE_CHAR30 */ | |
| 175 | ||
| 176 | /* chop newline */ | |
| 177 | if (*cp == '\n') | |
| 178 | *cp = '\0'; | |
| 179 | } | |
| 180 | ||
| 181 | /* Skip longest common prefix. */ | |
| 182 | for (cp = path; *cp == *oldpath; cp++, oldpath++) | |
| 183 | if (*cp == '\0') | |
| 184 | break; | |
| 185 | ||
| 186 | count = cp - path; | |
| 187 | diffcount = count - oldcount + OFFSET; | |
| 188 | oldcount = count; | |
| 189 | if (diffcount < 0 || diffcount > 2 * OFFSET) { | |
| 190 | if (putchar(SWITCH) == EOF || | |
| 191 | putw(diffcount, stdout) == EOF) | |
| 192 | err(1, "stdout"); | |
| 193 | } else | |
| 194 | if (putchar(diffcount) == EOF) | |
| 195 | err(1, "stdout"); | |
| 196 | ||
| 197 | while (*cp != '\0') { | |
| 198 | /* print *two* characters */ | |
| 199 | ||
| 200 | if ((code = BGINDEX(cp)) != (bg_t)-1) { | |
| 201 | /* | |
| 202 | * print *one* as bigram | |
| 203 | * Found, so mark byte with | |
| 204 | * parity bit. | |
| 205 | */ | |
| 206 | if (putchar((code / 2) | PARITY) == EOF) | |
| 207 | err(1, "stdout"); | |
| 208 | cp += 2; | |
| 209 | } | |
| 210 | ||
| 211 | else { | |
| 212 | for (i = 0; i < 2; i++) { | |
| 213 | if (*cp == '\0') | |
| 214 | break; | |
| 215 | ||
| 216 | /* print umlauts in file names */ | |
| 217 | if (*cp < ASCII_MIN || | |
| 218 | *cp > ASCII_MAX) { | |
| 219 | if (putchar(UMLAUT) == EOF || | |
| 220 | putchar(*cp++) == EOF) | |
| 221 | err(1, "stdout"); | |
| 222 | } | |
| 223 | ||
| 224 | else { | |
| 225 | /* normal character */ | |
| 226 | if(putchar(*cp++) == EOF) | |
| 227 | err(1, "stdout"); | |
| 228 | } | |
| 229 | } | |
| 230 | ||
| 231 | } | |
| 232 | } | |
| 233 | ||
| 234 | if (path == buf1) { /* swap pointers */ | |
| 235 | path = buf2; | |
| 236 | oldpath = buf1; | |
| 237 | } else { | |
| 238 | path = buf1; | |
| 239 | oldpath = buf2; | |
| 240 | } | |
| 241 | } | |
| 242 | /* Non-zero status if there were errors */ | |
| 243 | if (fflush(stdout) != 0 || ferror(stdout)) | |
| 244 | exit(1); | |
| 245 | exit(0); | |
| 246 | } | |
| 247 | ||
| 248 | #ifndef LOOKUP | |
| 249 | int | |
| 250 | bgindex(bg) /* Return location of bg in bigrams or -1. */ | |
| 251 | char *bg; | |
| 252 | { | |
| 6bd457ed | 253 | char bg0, bg1, *p; |
| 984263bc MD |
254 | |
| 255 | bg0 = bg[0]; | |
| 256 | bg1 = bg[1]; | |
| 257 | for (p = bigrams; *p != NULL; p++) | |
| 258 | if (*p++ == bg0 && *p == bg1) | |
| 259 | break; | |
| 260 | return (*p == NULL ? -1 : (--p - bigrams)); | |
| 261 | } | |
| 262 | #endif /* !LOOKUP */ | |
| 263 | ||
| 264 | void | |
| 89a89091 | 265 | usage(void) |
| 984263bc MD |
266 | { |
| 267 | (void)fprintf(stderr, | |
| 268 | "usage: locate.code common_bigrams < list > squozen_list\n"); | |
| 269 | exit(1); | |
| 270 | } |