| 1 | /* |
| 2 | * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin. |
| 3 | * Copyright (c) 1989, 1993 |
| 4 | * The Regents of the University of California. All rights reserved. |
| 5 | * |
| 6 | * This code is derived from software contributed to Berkeley by |
| 7 | * James A. Woods. |
| 8 | * |
| 9 | * Redistribution and use in source and binary forms, with or without |
| 10 | * modification, are permitted provided that the following conditions |
| 11 | * are met: |
| 12 | * 1. Redistributions of source code must retain the above copyright |
| 13 | * notice, this list of conditions and the following disclaimer. |
| 14 | * 2. Redistributions in binary form must reproduce the above copyright |
| 15 | * notice, this list of conditions and the following disclaimer in the |
| 16 | * documentation and/or other materials provided with the distribution. |
| 17 | * 3. All advertising materials mentioning features or use of this software |
| 18 | * must display the following acknowledgement: |
| 19 | * This product includes software developed by the University of |
| 20 | * California, Berkeley and its contributors. |
| 21 | * 4. Neither the name of the University nor the names of its contributors |
| 22 | * may be used to endorse or promote products derived from this software |
| 23 | * without specific prior written permission. |
| 24 | * |
| 25 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 26 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 27 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 28 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 29 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 30 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 31 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 32 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 33 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 34 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 35 | * SUCH DAMAGE. |
| 36 | * |
| 37 | * $FreeBSD: src/usr.bin/locate/code/locate.code.c,v 1.11.2.1 2001/03/04 08:46:46 kris Exp $ |
| 38 | * |
| 39 | * @(#) Copyright (c) 1989, 1993 The Regents of the University of California. All rights reserved. |
| 40 | * @(#)locate.code.c 8.1 (Berkeley) 6/6/93 |
| 41 | */ |
| 42 | |
| 43 | /* |
| 44 | * PURPOSE: sorted list compressor (works with a modified 'find' |
| 45 | * to encode/decode a filename database) |
| 46 | * |
| 47 | * USAGE: bigram < list > bigrams |
| 48 | * process bigrams (see updatedb) > common_bigrams |
| 49 | * code common_bigrams < list > squozen_list |
| 50 | * |
| 51 | * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 |
| 52 | * February/March 1983, p. 8). Output format is, per line, an |
| 53 | * offset differential count byte followed by a partially bigram- |
| 54 | * encoded ascii residue. A bigram is a two-character sequence, |
| 55 | * the first 128 most common of which are encoded in one byte. |
| 56 | * |
| 57 | * EXAMPLE: For simple front compression with no bigram encoding, |
| 58 | * if the input is... then the output is... |
| 59 | * |
| 60 | * /usr/src 0 /usr/src |
| 61 | * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c |
| 62 | * /usr/src/cmd/armadillo.c 14 armadillo.c |
| 63 | * /usr/tmp/zoo 5 tmp/zoo |
| 64 | * |
| 65 | * The codes are: |
| 66 | * |
| 67 | * 0-28 likeliest differential counts + offset to make nonnegative |
| 68 | * 30 switch code for out-of-range count to follow in next word |
| 69 | * 31 an 8-bit char follows |
| 70 | * 128-255 bigram codes (128 most common, as determined by 'updatedb') |
| 71 | * 32-127 single character (printable) ascii residue (ie, literal) |
| 72 | * |
| 73 | * The locate database stores any character except newline ('\n') |
| 74 | * and NUL ('\0'). The 8-bit character support doesn't waste extra |
| 75 | * space unless file names contain characters less than 32 |
| 76 | * or greater than 127. |
| 77 | * |
| 78 | * |
| 79 | * SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c |
| 80 | * |
| 81 | * AUTHOR: James A. Woods, Informatics General Corp., |
| 82 | * NASA Ames Research Center, 10/82 |
| 83 | * 8-bit file name characters: |
| 84 | * Wolfram Schneider, Berlin September 1996 |
| 85 | */ |
| 86 | |
| 87 | #include <sys/param.h> |
| 88 | #include <err.h> |
| 89 | #include <errno.h> |
| 90 | #include <stdlib.h> |
| 91 | #include <string.h> |
| 92 | #include <stdio.h> |
| 93 | #include <unistd.h> |
| 94 | #include "locate.h" |
| 95 | |
| 96 | #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ |
| 97 | |
| 98 | u_char buf1[MAXPATHLEN] = " "; |
| 99 | u_char buf2[MAXPATHLEN]; |
| 100 | u_char bigrams[BGBUFSIZE + 1] = { 0 }; |
| 101 | |
| 102 | #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ |
| 103 | |
| 104 | #ifdef LOOKUP |
| 105 | #define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)]) |
| 106 | typedef short bg_t; |
| 107 | bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1]; |
| 108 | #else |
| 109 | #define BGINDEX(x) bgindex(x) |
| 110 | typedef int bg_t; |
| 111 | int bgindex(char *); |
| 112 | #endif /* LOOKUP */ |
| 113 | |
| 114 | |
| 115 | void usage(void); |
| 116 | |
| 117 | int |
| 118 | main(int argc, char *argv[]) |
| 119 | { |
| 120 | u_char *cp, *oldpath, *path; |
| 121 | int ch, code, count, diffcount, oldcount; |
| 122 | FILE *fp; |
| 123 | int i, j; |
| 124 | |
| 125 | while ((ch = getopt(argc, argv, "")) != -1) |
| 126 | switch(ch) { |
| 127 | default: |
| 128 | usage(); |
| 129 | } |
| 130 | argc -= optind; |
| 131 | argv += optind; |
| 132 | |
| 133 | if (argc != 1) |
| 134 | usage(); |
| 135 | |
| 136 | if ((fp = fopen(argv[0], "r")) == NULL) |
| 137 | err(1, "%s", argv[0]); |
| 138 | |
| 139 | /* First copy bigram array to stdout. */ |
| 140 | (void)fgets(bigrams, BGBUFSIZE + 1, fp); |
| 141 | |
| 142 | if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) |
| 143 | err(1, "stdout"); |
| 144 | (void)fclose(fp); |
| 145 | |
| 146 | #ifdef LOOKUP |
| 147 | /* init lookup table */ |
| 148 | for (i = 0; i < UCHAR_MAX + 1; i++) |
| 149 | for (j = 0; j < UCHAR_MAX + 1; j++) |
| 150 | big[i][j] = (bg_t)-1; |
| 151 | |
| 152 | for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2) |
| 153 | big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i; |
| 154 | |
| 155 | #endif /* LOOKUP */ |
| 156 | |
| 157 | oldpath = buf1; |
| 158 | path = buf2; |
| 159 | oldcount = 0; |
| 160 | |
| 161 | while (fgets(path, sizeof(buf2), stdin) != NULL) { |
| 162 | |
| 163 | /* skip empty lines */ |
| 164 | if (*path == '\n') |
| 165 | continue; |
| 166 | |
| 167 | /* remove newline */ |
| 168 | for (cp = path; *cp != '\0'; cp++) { |
| 169 | #ifndef LOCATE_CHAR30 |
| 170 | /* old locate implementations core'd for char 30 */ |
| 171 | if (*cp == SWITCH) |
| 172 | *cp = '?'; |
| 173 | else |
| 174 | #endif /* !LOCATE_CHAR30 */ |
| 175 | |
| 176 | /* chop newline */ |
| 177 | if (*cp == '\n') |
| 178 | *cp = '\0'; |
| 179 | } |
| 180 | |
| 181 | /* Skip longest common prefix. */ |
| 182 | for (cp = path; *cp == *oldpath; cp++, oldpath++) |
| 183 | if (*cp == '\0') |
| 184 | break; |
| 185 | |
| 186 | count = cp - path; |
| 187 | diffcount = count - oldcount + OFFSET; |
| 188 | oldcount = count; |
| 189 | if (diffcount < 0 || diffcount > 2 * OFFSET) { |
| 190 | if (putchar(SWITCH) == EOF || |
| 191 | putw(diffcount, stdout) == EOF) |
| 192 | err(1, "stdout"); |
| 193 | } else |
| 194 | if (putchar(diffcount) == EOF) |
| 195 | err(1, "stdout"); |
| 196 | |
| 197 | while (*cp != '\0') { |
| 198 | /* print *two* characters */ |
| 199 | |
| 200 | if ((code = BGINDEX(cp)) != (bg_t)-1) { |
| 201 | /* |
| 202 | * print *one* as bigram |
| 203 | * Found, so mark byte with |
| 204 | * parity bit. |
| 205 | */ |
| 206 | if (putchar((code / 2) | PARITY) == EOF) |
| 207 | err(1, "stdout"); |
| 208 | cp += 2; |
| 209 | } |
| 210 | |
| 211 | else { |
| 212 | for (i = 0; i < 2; i++) { |
| 213 | if (*cp == '\0') |
| 214 | break; |
| 215 | |
| 216 | /* print umlauts in file names */ |
| 217 | if (*cp < ASCII_MIN || |
| 218 | *cp > ASCII_MAX) { |
| 219 | if (putchar(UMLAUT) == EOF || |
| 220 | putchar(*cp++) == EOF) |
| 221 | err(1, "stdout"); |
| 222 | } |
| 223 | |
| 224 | else { |
| 225 | /* normal character */ |
| 226 | if(putchar(*cp++) == EOF) |
| 227 | err(1, "stdout"); |
| 228 | } |
| 229 | } |
| 230 | |
| 231 | } |
| 232 | } |
| 233 | |
| 234 | if (path == buf1) { /* swap pointers */ |
| 235 | path = buf2; |
| 236 | oldpath = buf1; |
| 237 | } else { |
| 238 | path = buf1; |
| 239 | oldpath = buf2; |
| 240 | } |
| 241 | } |
| 242 | /* Non-zero status if there were errors */ |
| 243 | if (fflush(stdout) != 0 || ferror(stdout)) |
| 244 | exit(1); |
| 245 | exit(0); |
| 246 | } |
| 247 | |
| 248 | #ifndef LOOKUP |
| 249 | int |
| 250 | bgindex(bg) /* Return location of bg in bigrams or -1. */ |
| 251 | char *bg; |
| 252 | { |
| 253 | char bg0, bg1, *p; |
| 254 | |
| 255 | bg0 = bg[0]; |
| 256 | bg1 = bg[1]; |
| 257 | for (p = bigrams; *p != NULL; p++) |
| 258 | if (*p++ == bg0 && *p == bg1) |
| 259 | break; |
| 260 | return (*p == NULL ? -1 : (--p - bigrams)); |
| 261 | } |
| 262 | #endif /* !LOOKUP */ |
| 263 | |
/*
 * usage --
 *	Write the command synopsis to stderr and terminate with status 1.
 */
void
usage(void)
{
	static const char synopsis[] =
	    "usage: locate.code common_bigrams < list > squozen_list\n";

	(void)fputs(synopsis, stderr);
	exit(1);
}