| Commit | Line | Data |
|---|---|---|
| 2c872e05 | 1 | /* @(#)compress.c 8.1 (Berkeley) 6/6/93 */ |
| abecab39 SW |
2 | /* $NetBSD: compress.c,v 1.7 2009/04/14 08:50:06 lukem Exp $ */ |
| 3 | ||
| 984263bc MD |
4 | /* |
| 5 | * Copyright (c) 1989, 1993 | |
| 6 | * The Regents of the University of California. All rights reserved. | |
| 7 | * | |
| 8 | * This code is derived from software contributed to Berkeley by | |
| 9 | * Edward Wang at The University of California, Berkeley. | |
| 10 | * | |
| 11 | * Redistribution and use in source and binary forms, with or without | |
| 12 | * modification, are permitted provided that the following conditions | |
| 13 | * are met: | |
| 14 | * 1. Redistributions of source code must retain the above copyright | |
| 15 | * notice, this list of conditions and the following disclaimer. | |
| 16 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 17 | * notice, this list of conditions and the following disclaimer in the | |
| 18 | * documentation and/or other materials provided with the distribution. | |
| abecab39 | 19 | * 3. Neither the name of the University nor the names of its contributors |
| 984263bc MD |
20 | * may be used to endorse or promote products derived from this software |
| 21 | * without specific prior written permission. | |
| 22 | * | |
| 23 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
| 24 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 25 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 26 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
| 27 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 28 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 29 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 31 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 32 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 33 | * SUCH DAMAGE. | |
| 34 | */ | |
| 35 | ||
| 984263bc | 36 | #include <fcntl.h> |
| abecab39 | 37 | #include <stdio.h> |
| 984263bc | 38 | #include <stdlib.h> |
| 7a2e9d59 | 39 | #include <string.h> |
| abecab39 | 40 | #include "defs.h" |
| 7a2e9d59 SW |
41 | #include "tt.h" |
| 42 | ||
| abecab39 | 43 | /* special */ |
| 984263bc MD |
44 | int cc_trace = 0; |
| 45 | FILE *cc_trace_fp; | |
| 46 | ||
| 47 | /* tunable parameters */ | |
| 48 | ||
| 49 | int cc_reverse = 1; | |
| 50 | int cc_sort = 0; | |
| 51 | int cc_chop = 0; | |
| 52 | ||
| 53 | int cc_token_max = 8; /* <= TOKEN_MAX */ | |
| 54 | int cc_token_min = 2; /* > tt.tt_put_token_cost */ | |
| 55 | int cc_npass0 = 1; | |
| 56 | int cc_npass1 = 1; | |
| 57 | ||
| 58 | int cc_bufsize = 1024 * 3; /* XXX, or 80 * 24 * 2 */ | |
| 59 | ||
| 60 | int cc_ntoken = 8192; | |
| 61 | ||
| 62 | #define cc_weight XXX | |
| 63 | #ifndef cc_weight | |
| 64 | int cc_weight = 0; | |
| 65 | #endif | |
| 66 | ||
| 67 | #define TOKEN_MAX 16 | |
| 68 | ||
| 69 | struct cc { | |
| 70 | char string[TOKEN_MAX]; | |
| 71 | char length; | |
| 72 | char flag; | |
| 73 | #ifndef cc_weight | |
| 74 | short weight; | |
| 75 | #endif | |
| 76 | long time; /* time last seen */ | |
| 77 | short bcount; /* count in this buffer */ | |
| 78 | short ccount; /* count in compression */ | |
| 79 | short places; /* places in the buffer */ | |
| 80 | short code; /* token code */ | |
| 81 | struct cc *qforw, *qback; | |
| 82 | struct cc *hforw, **hback; | |
| 83 | }; | |
| 84 | ||
| 85 | short cc_thresholds[TOKEN_MAX + 1]; | |
| 86 | #define thresh(length) (cc_thresholds[length]) | |
| 87 | #define threshp(code, count, length) \ | |
| 88 | ((code) >= 0 || (short) (count) >= cc_thresholds[length]) | |
| 89 | ||
| 90 | #ifndef cc_weight | |
| 91 | short cc_wthresholds[TOKEN_MAX + 1]; | |
| 92 | #define wthresh(length) (cc_wthresholds[length]) | |
| 93 | #define wthreshp(weight, length) ((short) (weight) >= cc_wthresholds[length]) | |
| 94 | #else | |
| 95 | #define wthreshp(weight, length) (0) | |
| 96 | #endif | |
| 97 | ||
| 98 | #ifndef cc_weight | |
| 99 | short cc_wlimits[TOKEN_MAX + 1]; | |
| 100 | #define wlimit(length) (cc_wlimits[length]) | |
| 101 | #endif | |
| 102 | ||
| 103 | #define put_token_score(length) ((length) - tt.tt_put_token_cost) | |
| 104 | ||
| 105 | int cc_score_adjustments[TOKEN_MAX + 1][8]; /* XXX, 8 > max of cc_thresholds */ | |
| 106 | #define score_adjust(score, p) \ | |
| 107 | do { \ | |
| abecab39 SW |
108 | int _length = (p)->length; \ |
| 109 | int _ccount = (p)->ccount; \ | |
| 110 | if (threshp((p)->code, _ccount, _length) || \ | |
| 111 | wthreshp((p)->weight, _length)) /* XXX */ \ | |
| 112 | (score) -= _length - tt.tt_put_token_cost; \ | |
| 984263bc | 113 | else \ |
| abecab39 | 114 | (score) += cc_score_adjustments[_length][_ccount]; \ |
| 984263bc MD |
115 | } while (0) |
| 116 | ||
| 117 | int cc_initial_scores[TOKEN_MAX + 1][8]; /* XXX, 8 > max of cc_thresholds */ | |
| 118 | ||
| 119 | struct cc cc_q0a, cc_q0b, cc_q1a, cc_q1b; | |
| 120 | ||
| 121 | #define qinsert(p1, p2) \ | |
| 122 | do { \ | |
| abecab39 SW |
123 | struct cc *forw = (p1)->qforw; \ |
| 124 | struct cc *back = (p1)->qback; \ | |
| 984263bc MD |
125 | back->qforw = forw; \ |
| 126 | forw->qback = back; \ | |
| 127 | forw = (p2)->qforw; \ | |
| 128 | (p1)->qforw = forw; \ | |
| 129 | forw->qback = (p1); \ | |
| 130 | (p2)->qforw = (p1); \ | |
| 131 | (p1)->qback = (p2); \ | |
| 132 | } while (0) | |
| 133 | ||
| 134 | #define qinsertq(q, p) \ | |
| 135 | ((q)->qforw == (q) ? 0 : \ | |
| 136 | ((q)->qback->qforw = (p)->qforw, \ | |
| 137 | (p)->qforw->qback = (q)->qback, \ | |
| 138 | (q)->qforw->qback = (p), \ | |
| 139 | (p)->qforw = (q)->qforw, \ | |
| 140 | (q)->qforw = (q), \ | |
| 141 | (q)->qback = (q))) | |
| 142 | ||
| 143 | #define H (14) | |
| 144 | #define HSIZE (1 << H) | |
| abecab39 | 145 | #define hash(h, c) (((((h) >> (H - 8)) | (h) << 8) ^ (c)) & (HSIZE - 1)) |
| 984263bc MD |
146 | |
| 147 | char *cc_buffer; | |
| 148 | struct cc **cc_output; /* the output array */ | |
| 149 | short *cc_places[TOKEN_MAX + 1]; | |
| 150 | short *cc_hashcodes; /* for computing hashcodes */ | |
| 151 | struct cc **cc_htab; /* the hash table */ | |
| 152 | struct cc **cc_tokens; /* holds all the active tokens */ | |
| 153 | struct cc_undo { | |
| 154 | struct cc **pos; | |
| 155 | struct cc *val; | |
| 156 | } *cc_undo; | |
| 157 | ||
| 158 | long cc_time, cc_time0; | |
| 159 | ||
| 160 | char *cc_tt_ob, *cc_tt_obe; | |
| 161 | ||
| abecab39 SW |
162 | int cc_compress(struct cc **, struct cc **, char); |
| 163 | void cc_compress_cleanup(struct cc **, int); | |
| 164 | void cc_compress_phase(struct cc **, int, struct cc **, int); | |
| 165 | void cc_compress_phase1(struct cc **, struct cc **, int, int); | |
| 166 | void cc_output_phase(char *, struct cc **, int); | |
| 167 | int cc_sweep(char *, int, struct cc **, int); | |
| 168 | void cc_sweep0(char *, int, int); | |
| 169 | int cc_sweep_phase(char *, int, struct cc **); | |
| 170 | void cc_sweep_reverse(struct cc **, short *); | |
| 171 | int cc_token_compare(const void *, const void *); | |
| 172 | ||
| 173 | int | |
| 174 | ccinit(void) | |
| 984263bc | 175 | { |
| abecab39 SW |
176 | int i, j; |
| 177 | struct cc *p; | |
| 984263bc MD |
178 | |
| 179 | if (tt.tt_token_max > cc_token_max) | |
| 180 | tt.tt_token_max = cc_token_max; | |
| 181 | if (tt.tt_token_min < cc_token_min) | |
| 182 | tt.tt_token_min = cc_token_min; | |
| 183 | if (tt.tt_token_min > tt.tt_token_max) { | |
| 184 | tt.tt_ntoken = 0; | |
| 185 | return 0; | |
| 186 | } | |
| 187 | if (tt.tt_ntoken > cc_ntoken / 2) /* not likely */ | |
| 188 | tt.tt_ntoken = cc_ntoken / 2; | |
| 189 | #define C(x) (sizeof (x) / sizeof *(x)) | |
| abecab39 | 190 | for (i = 0; i < (int)C(cc_thresholds); i++) { |
| 984263bc MD |
191 | int h = i - tt.tt_put_token_cost; |
| 192 | if (h > 0) | |
| 193 | cc_thresholds[i] = | |
| 194 | (tt.tt_set_token_cost + 1 + h - 1) / h + 1; | |
| 195 | else | |
| 196 | cc_thresholds[i] = 0; | |
| 197 | } | |
| abecab39 | 198 | for (i = 0; i < (int)C(cc_score_adjustments); i++) { |
| 984263bc | 199 | int t = cc_thresholds[i]; |
| abecab39 | 200 | for (j = 0; j < (int)C(*cc_score_adjustments); j++) { |
| 984263bc MD |
201 | if (j >= t) |
| 202 | cc_score_adjustments[i][j] = | |
| 203 | - (i - tt.tt_put_token_cost); | |
| 204 | else if (j < t - 1) | |
| 205 | cc_score_adjustments[i][j] = 0; | |
| 206 | else | |
| 207 | /* | |
| 208 | * cost now is | |
| 209 | * length * (ccount + 1) a | |
| 210 | * cost before was | |
| 211 | * set-token-cost + length + | |
| 212 | * ccount * put-token-cost b | |
| 213 | * the score adjustment is (b - a) | |
| 214 | */ | |
| 215 | cc_score_adjustments[i][j] = | |
| 216 | tt.tt_set_token_cost + i + | |
| 217 | j * tt.tt_put_token_cost - | |
| 218 | i * (j + 1); | |
| 219 | if (j >= t) | |
| 220 | cc_initial_scores[i][j] = 0; | |
| 221 | else | |
| 222 | /* | |
| 223 | * - (set-token-cost + | |
| 224 | * (length - put-token-cost) - | |
| 225 | * (length - put-token-cost) * ccount) | |
| 226 | */ | |
| 227 | cc_initial_scores[i][j] = | |
| 228 | - (tt.tt_set_token_cost + | |
| 229 | (i - tt.tt_put_token_cost) - | |
| 230 | (i - tt.tt_put_token_cost) * j); | |
| 231 | } | |
| 232 | } | |
| 233 | #ifndef cc_weight | |
| 234 | for (i = 1; i < C(cc_wthresholds); i++) { | |
| 235 | cc_wthresholds[i] = | |
| 236 | ((tt.tt_set_token_cost + tt.tt_put_token_cost) / i + | |
| 237 | i / 5 + 1) * | |
| 238 | cc_weight + 1; | |
| 239 | cc_wlimits[i] = cc_wthresholds[i] + cc_weight; | |
| 240 | } | |
| 241 | #endif | |
| 242 | #undef C | |
| 243 | if ((cc_output = (struct cc **) | |
| 244 | malloc((unsigned) cc_bufsize * sizeof *cc_output)) == 0) | |
| 245 | goto nomem; | |
| 246 | if ((cc_hashcodes = (short *) | |
| 247 | malloc((unsigned) cc_bufsize * sizeof *cc_hashcodes)) == 0) | |
| 248 | goto nomem; | |
| 249 | if ((cc_htab = (struct cc **) malloc(HSIZE * sizeof *cc_htab)) == 0) | |
| 250 | goto nomem; | |
| 251 | if ((cc_tokens = (struct cc **) | |
| 252 | malloc((unsigned) | |
| 253 | (cc_ntoken + tt.tt_token_max - tt.tt_token_min + 1) * | |
| 254 | sizeof *cc_tokens)) == 0) | |
| 255 | goto nomem; | |
| 256 | if ((cc_undo = (struct cc_undo *) | |
| 257 | malloc((unsigned) cc_bufsize * sizeof *cc_undo)) == 0) | |
| 258 | goto nomem; | |
| 259 | for (i = tt.tt_token_min; i <= tt.tt_token_max; i++) | |
| 260 | if ((cc_places[i] = (short *) | |
| 261 | malloc((unsigned) cc_bufsize * sizeof **cc_places)) == 0) | |
| 262 | goto nomem; | |
| 263 | cc_q0a.qforw = cc_q0a.qback = &cc_q0a; | |
| 264 | cc_q0b.qforw = cc_q0b.qback = &cc_q0b; | |
| 265 | cc_q1a.qforw = cc_q1a.qback = &cc_q1a; | |
| 266 | cc_q1b.qforw = cc_q1b.qback = &cc_q1b; | |
| 267 | if ((p = (struct cc *) malloc((unsigned) cc_ntoken * sizeof *p)) == 0) | |
| 268 | goto nomem; | |
| 269 | for (i = 0; i < tt.tt_ntoken; i++) { | |
| 270 | p->code = i; | |
| 271 | p->time = -1; | |
| 272 | p->qback = cc_q0a.qback; | |
| 273 | p->qforw = &cc_q0a; | |
| 274 | p->qback->qforw = p; | |
| 275 | cc_q0a.qback = p; | |
| 276 | p++; | |
| 277 | } | |
| 278 | for (; i < cc_ntoken; i++) { | |
| 279 | p->code = -1; | |
| 280 | p->time = -1; | |
| 281 | p->qback = cc_q1a.qback; | |
| 282 | p->qforw = &cc_q1a; | |
| 283 | p->qback->qforw = p; | |
| 284 | cc_q1a.qback = p; | |
| 285 | p++; | |
| 286 | } | |
| 287 | cc_tt_ob = tt_ob; | |
| 288 | cc_tt_obe = tt_obe; | |
| 289 | if ((cc_buffer = malloc((unsigned) cc_bufsize)) == 0) | |
| 290 | goto nomem; | |
| 291 | return 0; | |
| 292 | nomem: | |
| 293 | wwerrno = WWE_NOMEM; | |
| 294 | return -1; | |
| 295 | } | |
| 296 | ||
| abecab39 SW |
297 | void |
| 298 | ccstart(void) | |
| 984263bc | 299 | { |
| 984263bc MD |
300 | ttflush(); |
| 301 | tt_obp = tt_ob = cc_buffer; | |
| 302 | tt_obe = tt_ob + cc_bufsize; | |
| 303 | tt.tt_flush = ccflush; | |
| 304 | if (cc_trace) { | |
| 305 | cc_trace_fp = fopen("window-trace", "a"); | |
| 306 | (void) fcntl(fileno(cc_trace_fp), F_SETFD, 1); | |
| 307 | } | |
| 308 | ccreset(); | |
| 309 | } | |
| 310 | ||
| abecab39 SW |
311 | void |
| 312 | ccreset(void) | |
| 984263bc | 313 | { |
| abecab39 | 314 | struct cc *p; |
| 984263bc | 315 | |
| abecab39 | 316 | memset((char *) cc_htab, 0, HSIZE * sizeof *cc_htab); |
| 984263bc MD |
317 | for (p = cc_q0a.qforw; p != &cc_q0a; p = p->qforw) |
| 318 | p->hback = 0; | |
| 319 | for (p = cc_q1a.qforw; p != &cc_q1a; p = p->qforw) | |
| 320 | p->hback = 0; | |
| 321 | } | |
| 322 | ||
| abecab39 SW |
323 | void |
| 324 | ccend(void) | |
| 984263bc MD |
325 | { |
| 326 | ||
| 327 | ttflush(); | |
| 328 | tt_obp = tt_ob = cc_tt_ob; | |
| 329 | tt_obe = cc_tt_obe; | |
| 330 | tt.tt_flush = 0; | |
| 331 | if (cc_trace_fp != NULL) { | |
| 332 | (void) fclose(cc_trace_fp); | |
| 333 | cc_trace_fp = NULL; | |
| 334 | } | |
| 335 | } | |
| 336 | ||
| abecab39 SW |
337 | void |
| 338 | ccflush(void) | |
| 984263bc MD |
339 | { |
| 340 | int bufsize = tt_obp - tt_ob; | |
| 341 | int n; | |
| 342 | ||
| 343 | if (tt_ob != cc_buffer) | |
| 344 | abort(); | |
| 345 | if (cc_trace_fp != NULL) { | |
| 346 | (void) fwrite(tt_ob, 1, bufsize, cc_trace_fp); | |
| 347 | (void) putc(-1, cc_trace_fp); | |
| 348 | } | |
| 349 | tt.tt_flush = 0; | |
| 350 | (*tt.tt_compress)(1); | |
| 351 | if (bufsize < tt.tt_token_min) { | |
| 352 | ttflush(); | |
| 353 | goto out; | |
| 354 | } | |
| 355 | tt_obp = tt_ob = cc_tt_ob; | |
| 356 | tt_obe = cc_tt_obe; | |
| 357 | cc_time0 = cc_time; | |
| 358 | cc_time += bufsize; | |
| 359 | n = cc_sweep_phase(cc_buffer, bufsize, cc_tokens); | |
| 360 | cc_compress_phase(cc_output, bufsize, cc_tokens, n); | |
| 361 | cc_output_phase(cc_buffer, cc_output, bufsize); | |
| 362 | ttflush(); | |
| 363 | tt_obp = tt_ob = cc_buffer; | |
| 364 | tt_obe = cc_buffer + cc_bufsize; | |
| 365 | out: | |
| 366 | (*tt.tt_compress)(0); | |
| 367 | tt.tt_flush = ccflush; | |
| 368 | } | |
| 369 | ||
| abecab39 SW |
370 | int |
| 371 | cc_sweep_phase(char *buffer, int bufsize, struct cc **tokens) | |
| 984263bc | 372 | { |
| abecab39 SW |
373 | struct cc **pp = tokens; |
| 374 | int i, n; | |
| 984263bc MD |
375 | #ifdef STATS |
| 376 | int nn, ii; | |
| 377 | #endif | |
| 378 | ||
| 379 | #ifdef STATS | |
| 380 | if (verbose >= 0) | |
| 381 | time_begin(); | |
| 382 | if (verbose > 0) | |
| 383 | printf("Sweep:"); | |
| 384 | #endif | |
| 385 | cc_sweep0(buffer, bufsize, tt.tt_token_min - 1); | |
| 386 | #ifdef STATS | |
| 387 | ntoken_stat = 0; | |
| 388 | nn = 0; | |
| 389 | ii = 0; | |
| 390 | #endif | |
| 391 | for (i = tt.tt_token_min; i <= tt.tt_token_max; i++) { | |
| 392 | #ifdef STATS | |
| 393 | if (verbose > 0) { | |
| 394 | if (ii > 7) { | |
| 395 | printf("\n "); | |
| 396 | ii = 0; | |
| 397 | } | |
| 398 | ii++; | |
| 399 | printf(" (%d", i); | |
| 400 | (void) fflush(stdout); | |
| 401 | } | |
| 402 | #endif | |
| 403 | n = cc_sweep(buffer, bufsize, pp, i); | |
| 404 | pp += n; | |
| 405 | #ifdef STATS | |
| 406 | if (verbose > 0) { | |
| 407 | if (--n > 0) { | |
| 408 | printf(" %d", n); | |
| 409 | nn += n; | |
| 410 | } | |
| 411 | putchar(')'); | |
| 412 | } | |
| 413 | #endif | |
| 414 | } | |
| 415 | qinsertq(&cc_q1b, &cc_q1a); | |
| 416 | #ifdef STATS | |
| 417 | if (verbose > 0) | |
| 418 | printf("\n %d tokens, %d candidates\n", | |
| 419 | ntoken_stat, nn); | |
| 420 | if (verbose >= 0) | |
| 421 | time_end(); | |
| 422 | #endif | |
| 423 | return pp - tokens; | |
| 424 | } | |
| 425 | ||
| abecab39 SW |
426 | void |
| 427 | cc_sweep0(char *buffer, int n, int length) | |
| 984263bc | 428 | { |
| abecab39 SW |
429 | char *p; |
| 430 | short *hc; | |
| 431 | int i; | |
| 432 | short c; | |
| 433 | short pc = tt.tt_padc; | |
| 984263bc MD |
434 | |
| 435 | /* n and length are at least 1 */ | |
| 436 | p = buffer++; | |
| 437 | hc = cc_hashcodes; | |
| 438 | i = n; | |
| 439 | do { | |
| 440 | if ((*hc++ = *p++) == pc) | |
| 441 | hc[-1] = -1; | |
| 442 | } while (--i); | |
| 443 | while (--length) { | |
| 444 | p = buffer++; | |
| 445 | hc = cc_hashcodes; | |
| 446 | for (i = n--; --i;) { | |
| 447 | if ((c = *p++) == pc || *hc < 0) | |
| 448 | c = -1; | |
| 449 | else | |
| 450 | c = hash(*hc, c); | |
| 451 | *hc++ = c; | |
| 452 | } | |
| 453 | } | |
| 454 | } | |
| 455 | ||
| abecab39 SW |
456 | int |
| 457 | cc_sweep(char *buffer, int bufsize, struct cc **tokens, int length) | |
| 984263bc | 458 | { |
| abecab39 SW |
459 | struct cc *p; |
| 460 | char *cp; | |
| 461 | int i; | |
| 984263bc MD |
462 | short *hc; |
| 463 | short *places = cc_places[length]; | |
| 464 | struct cc **pp = tokens; | |
| 465 | short threshold = thresh(length); | |
| 466 | #ifndef cc_weight | |
| 467 | short wthreshold = wthresh(length); | |
| 468 | short limit = wlimit(length); | |
| 469 | #endif | |
| abecab39 | 470 | int time0; |
| 984263bc MD |
471 | short pc = tt.tt_padc; |
| 472 | ||
| 473 | i = length - 1; | |
| 474 | bufsize -= i; | |
| 475 | cp = buffer + i; | |
| 476 | hc = cc_hashcodes; | |
| abecab39 SW |
477 | time0 = cc_time0; |
| 478 | for (i = 0; i < bufsize; i++, time0++) { | |
| 984263bc MD |
479 | struct cc **h; |
| 480 | ||
| 481 | { | |
| abecab39 SW |
482 | short *hc1 = hc; |
| 483 | short c = *cp++; | |
| 484 | short hh; | |
| 984263bc MD |
485 | if ((hh = *hc1) < 0 || c == pc) { |
| 486 | *hc1++ = -1; | |
| 487 | hc = hc1; | |
| 488 | continue; | |
| 489 | } | |
| 490 | h = cc_htab + (*hc1++ = hash(hh, c)); | |
| 491 | hc = hc1; | |
| 492 | } | |
| 493 | for (p = *h; p != 0; p = p->hforw) | |
| 494 | if (p->length == (char) length) { | |
| abecab39 SW |
495 | char *p1 = p->string; |
| 496 | char *p2 = cp - length; | |
| 497 | int n = length; | |
| 984263bc MD |
498 | do |
| 499 | if (*p1++ != *p2++) | |
| 500 | goto fail; | |
| 501 | while (--n); | |
| 502 | break; | |
| 503 | fail:; | |
| 504 | } | |
| 505 | if (p == 0) { | |
| 506 | p = cc_q1a.qback; | |
| 507 | if (p == &cc_q1a || | |
| abecab39 | 508 | (p->time >= cc_time0 && p->length == (char) length)) |
| 984263bc MD |
509 | continue; |
| 510 | if (p->hback != 0) | |
| 511 | if ((*p->hback = p->hforw) != 0) | |
| 512 | p->hforw->hback = p->hback; | |
| 513 | { | |
| abecab39 SW |
514 | char *p1 = p->string; |
| 515 | char *p2 = cp - length; | |
| 516 | int n = length; | |
| 984263bc MD |
517 | do |
| 518 | *p1++ = *p2++; | |
| 519 | while (--n); | |
| 520 | } | |
| 521 | p->length = length; | |
| 522 | #ifndef cc_weight | |
| 523 | p->weight = cc_weight; | |
| 524 | #endif | |
| abecab39 | 525 | p->time = time0; |
| 984263bc MD |
526 | p->bcount = 1; |
| 527 | p->ccount = 0; | |
| 528 | p->flag = 0; | |
| 529 | if ((p->hforw = *h) != 0) | |
| 530 | p->hforw->hback = &p->hforw; | |
| 531 | *h = p; | |
| 532 | p->hback = h; | |
| 533 | qinsert(p, &cc_q1a); | |
| 534 | places[i] = -1; | |
| 535 | p->places = i; | |
| 536 | #ifdef STATS | |
| 537 | ntoken_stat++; | |
| 538 | #endif | |
| 539 | } else if (p->time < cc_time0) { | |
| 540 | #ifndef cc_weight | |
| abecab39 | 541 | if ((p->weight += p->time - time0) < 0) |
| 984263bc MD |
542 | p->weight = cc_weight; |
| 543 | else if ((p->weight += cc_weight) > limit) | |
| 544 | p->weight = limit; | |
| 545 | #endif | |
| abecab39 | 546 | p->time = time0; |
| 984263bc MD |
547 | p->bcount = 1; |
| 548 | p->ccount = 0; | |
| 549 | if (p->code >= 0) { | |
| 550 | p->flag = 1; | |
| 551 | *pp++ = p; | |
| 552 | } else | |
| 553 | #ifndef cc_weight | |
| 554 | if (p->weight >= wthreshold) { | |
| 555 | p->flag = 1; | |
| 556 | *pp++ = p; | |
| 557 | qinsert(p, &cc_q1b); | |
| 558 | } else | |
| 559 | #endif | |
| 560 | { | |
| 561 | p->flag = 0; | |
| 562 | qinsert(p, &cc_q1a); | |
| 563 | } | |
| 564 | places[i] = -1; | |
| 565 | p->places = i; | |
| 566 | #ifdef STATS | |
| 567 | ntoken_stat++; | |
| 568 | #endif | |
| abecab39 | 569 | } else if (p->time + length > time0) { |
| 984263bc MD |
570 | /* |
| 571 | * overlapping token, don't count as two and | |
| abecab39 | 572 | * don't update time0, but do adjust weight to offset |
| 984263bc MD |
573 | * the difference |
| 574 | */ | |
| 575 | #ifndef cc_weight | |
| 576 | if (cc_weight != 0) { /* XXX */ | |
| abecab39 | 577 | p->weight += time0 - p->time; |
| 984263bc MD |
578 | if (!p->flag && p->weight >= wthreshold) { |
| 579 | p->flag = 1; | |
| 580 | *pp++ = p; | |
| 581 | qinsert(p, &cc_q1b); | |
| 582 | } | |
| 583 | } | |
| 584 | #endif | |
| 585 | places[i] = p->places; | |
| 586 | p->places = i; | |
| 587 | } else { | |
| 588 | #ifndef cc_weight | |
| abecab39 | 589 | if ((p->weight += p->time - time0) < 0) |
| 984263bc MD |
590 | p->weight = cc_weight; |
| 591 | else if ((p->weight += cc_weight) > limit) | |
| 592 | p->weight = limit; | |
| 593 | #endif | |
| abecab39 | 594 | p->time = time0; |
| 984263bc MD |
595 | p->bcount++; |
| 596 | if (!p->flag && | |
| 597 | /* code must be < 0 if flag false here */ | |
| 598 | (p->bcount >= threshold | |
| 599 | #ifndef cc_weight | |
| 600 | || p->weight >= wthreshold | |
| 601 | #endif | |
| 602 | )) { | |
| 603 | p->flag = 1; | |
| 604 | *pp++ = p; | |
| 605 | qinsert(p, &cc_q1b); | |
| 606 | } | |
| 607 | places[i] = p->places; | |
| 608 | p->places = i; | |
| 609 | } | |
| 610 | } | |
| 611 | if ((i = pp - tokens) > 0) { | |
| 612 | *pp = 0; | |
| 613 | if (cc_reverse) | |
| 614 | cc_sweep_reverse(tokens, places); | |
| 615 | if (cc_sort && i > 1) { | |
| 984263bc MD |
616 | qsort((char *) tokens, i, sizeof *tokens, |
| 617 | cc_token_compare); | |
| 618 | } | |
| 619 | if (cc_chop) { | |
| 620 | if ((i = i * cc_chop / 100) == 0) | |
| 621 | i = 1; | |
| 622 | tokens[i] = 0; | |
| 623 | } | |
| 624 | i++; | |
| 625 | } | |
| 626 | return i; | |
| 627 | } | |
| 628 | ||
| abecab39 SW |
629 | void |
| 630 | cc_sweep_reverse(struct cc **pp, short int *places) | |
| 984263bc | 631 | { |
| abecab39 SW |
632 | struct cc *p; |
| 633 | short frnt, back, t; | |
| 984263bc MD |
634 | |
| 635 | while ((p = *pp++) != 0) { | |
| 636 | back = -1; | |
| 637 | t = p->places; | |
| 638 | /* the list is never empty */ | |
| 639 | do { | |
| abecab39 | 640 | frnt = places[t]; |
| 984263bc MD |
641 | places[t] = back; |
| 642 | back = t; | |
| abecab39 | 643 | } while ((t = frnt) >= 0); |
| 984263bc MD |
644 | p->places = back; |
| 645 | } | |
| 646 | } | |
| 647 | ||
| abecab39 SW |
648 | void |
| 649 | cc_compress_phase(struct cc **output, int bufsize, struct cc **tokens, int ntoken) | |
| 984263bc | 650 | { |
| abecab39 | 651 | int i; |
| 984263bc | 652 | |
| abecab39 | 653 | memset((char *) output, 0, bufsize * sizeof *output); |
| 984263bc MD |
654 | for (i = 0; i < cc_npass0; i++) |
| 655 | cc_compress_phase1(output, tokens, ntoken, 0); | |
| 656 | for (i = 0; i < cc_npass1; i++) | |
| 657 | cc_compress_phase1(output, tokens, ntoken, 1); | |
| 658 | cc_compress_cleanup(output, bufsize); | |
| 659 | } | |
| 660 | ||
| abecab39 SW |
661 | void |
| 662 | cc_compress_phase1(struct cc **output, struct cc **tokens, int ntoken, int flag) | |
| 984263bc | 663 | { |
| abecab39 | 664 | struct cc **pp; |
| 984263bc | 665 | #ifdef STATS |
| abecab39 | 666 | int i = 0; |
| 984263bc MD |
667 | int nt = 0, cc = 0, nc = 0; |
| 668 | #endif | |
| 669 | ||
| 670 | #ifdef STATS | |
| 671 | if (verbose >= 0) | |
| 672 | time_begin(); | |
| 673 | if (verbose > 0) | |
| 674 | printf("Compress:"); | |
| 675 | #endif | |
| 676 | pp = tokens; | |
| 677 | while (pp < tokens + ntoken) { | |
| 678 | #ifdef STATS | |
| 679 | if (verbose > 0) { | |
| 680 | ntoken_stat = 0; | |
| 681 | ccount_stat = 0; | |
| 682 | ncover_stat = 0; | |
| 683 | if (i > 2) { | |
| 684 | printf("\n "); | |
| 685 | i = 0; | |
| 686 | } | |
| 687 | i++; | |
| 688 | printf(" (%d", (*pp)->length); | |
| 689 | (void) fflush(stdout); | |
| 690 | } | |
| 691 | #endif | |
| 692 | pp += cc_compress(output, pp, flag); | |
| 693 | #ifdef STATS | |
| 694 | if (verbose > 0) { | |
| 695 | printf(" %dt %du %dc)", ntoken_stat, ccount_stat, | |
| 696 | ncover_stat); | |
| 697 | nt += ntoken_stat; | |
| 698 | cc += ccount_stat; | |
| 699 | nc += ncover_stat; | |
| 700 | } | |
| 701 | #endif | |
| 702 | } | |
| 703 | #ifdef STATS | |
| 704 | if (verbose > 0) | |
| 705 | printf("\n total: (%dt %du %dc)\n", nt, cc, nc); | |
| 706 | if (verbose >= 0) | |
| 707 | time_end(); | |
| 708 | #endif | |
| 709 | } | |
| 710 | ||
| abecab39 SW |
711 | void |
| 712 | cc_compress_cleanup(struct cc **output, int bufsize) | |
| 984263bc | 713 | { |
| abecab39 | 714 | struct cc **end; |
| 984263bc MD |
715 | |
| 716 | /* the previous output phase may have been interrupted */ | |
| 717 | qinsertq(&cc_q0b, &cc_q0a); | |
| 718 | for (end = output + bufsize; output < end;) { | |
| abecab39 SW |
719 | struct cc *p; |
| 720 | int length; | |
| 984263bc MD |
721 | if ((p = *output) == 0) { |
| 722 | output++; | |
| 723 | continue; | |
| 724 | } | |
| 725 | length = p->length; | |
| 726 | if (!p->flag) { | |
| 727 | } else if (p->code >= 0) { | |
| 728 | qinsert(p, &cc_q0b); | |
| 729 | p->flag = 0; | |
| 730 | } else if (p->ccount == 0) { | |
| 731 | *output = 0; | |
| 732 | } else if (p->ccount >= thresh(length) | |
| 733 | #ifndef cc_weight | |
| 734 | || wthreshp(p->weight, length) | |
| 735 | #endif | |
| 736 | ) { | |
| 737 | p->flag = 0; | |
| 738 | } else { | |
| 739 | p->ccount = 0; | |
| 740 | *output = 0; | |
| 741 | } | |
| 742 | output += length; | |
| 743 | } | |
| 744 | } | |
| 745 | ||
| abecab39 SW |
746 | int |
| 747 | cc_compress(struct cc **output, struct cc **tokens, char flag) | |
| 984263bc MD |
748 | { |
| 749 | struct cc **pp = tokens; | |
| abecab39 | 750 | struct cc *p = *pp++; |
| 984263bc MD |
751 | int length = p->length; |
| 752 | int threshold = thresh(length); | |
| 753 | #ifndef cc_weight | |
| 754 | short wthreshold = wthresh(length); | |
| 755 | #endif | |
| 756 | short *places = cc_places[length]; | |
| 757 | int *initial_scores = cc_initial_scores[length]; | |
| 758 | int initial_score0 = put_token_score(length); | |
| 759 | ||
| 760 | do { | |
| 761 | int score; | |
| abecab39 | 762 | struct cc_undo *undop; |
| 984263bc MD |
763 | int ccount; |
| 764 | #ifdef STATS | |
| 765 | int ncover; | |
| 766 | #endif | |
| 767 | int i; | |
| 768 | ||
| 769 | ccount = p->ccount; | |
| 770 | if ((short) ccount >= p->bcount) | |
| 771 | continue; | |
| 772 | if (p->code >= 0 || ccount >= threshold) | |
| 773 | score = 0; | |
| 774 | #ifndef cc_weight | |
| 775 | else if (p->weight >= wthreshold) | |
| 776 | /* allow one fewer match than normal */ | |
| 777 | /* XXX, should adjust for ccount */ | |
| 778 | score = - tt.tt_set_token_cost; | |
| 779 | #endif | |
| 780 | else | |
| 781 | score = initial_scores[ccount]; | |
| 782 | undop = cc_undo; | |
| 783 | #ifdef STATS | |
| 784 | ncover = 0; | |
| 785 | #endif | |
| 786 | for (i = p->places; i >= 0; i = places[i]) { | |
| abecab39 SW |
787 | struct cc **jp; |
| 788 | struct cc *x; | |
| 789 | struct cc **ip = output + i; | |
| 790 | int score0 = initial_score0; | |
| 984263bc MD |
791 | struct cc **iip = ip + length; |
| 792 | struct cc_undo *undop1 = undop; | |
| 793 | ||
| 794 | if ((x = *(jp = ip)) != 0) | |
| 795 | goto z; | |
| 796 | while (--jp >= output) | |
| 797 | if ((x = *jp) != 0) { | |
| 798 | if (jp + x->length > ip) | |
| 799 | goto z; | |
| 800 | break; | |
| 801 | } | |
| 802 | jp = ip + 1; | |
| 803 | while (jp < iip) { | |
| 804 | if ((x = *jp) == 0) { | |
| 805 | jp++; | |
| 806 | continue; | |
| 807 | } | |
| 808 | z: | |
| 809 | if (x == p) | |
| 810 | goto undo; | |
| 811 | #ifdef STATS | |
| 812 | ncover++; | |
| 813 | #endif | |
| 814 | undop->pos = jp; | |
| 815 | undop->val = x; | |
| 816 | undop++; | |
| 817 | *jp = 0; | |
| 818 | x->ccount--; | |
| 819 | score_adjust(score0, x); | |
| 820 | if (score0 < 0 && flag) | |
| 821 | goto undo; | |
| 822 | jp += x->length; | |
| 823 | } | |
| 824 | undop->pos = ip; | |
| 825 | undop->val = 0; | |
| 826 | undop++; | |
| 827 | *ip = p; | |
| 828 | ccount++; | |
| 829 | score += score0; | |
| 830 | continue; | |
| 831 | undo: | |
| 832 | while (--undop >= undop1) | |
| abecab39 | 833 | if ((*undop->pos = x = undop->val)) |
| 984263bc MD |
834 | x->ccount++; |
| 835 | undop++; | |
| 836 | } | |
| 837 | if (score > 0) { | |
| 838 | #ifdef STATS | |
| 839 | ccount_stat += ccount - p->ccount; | |
| 840 | ntoken_stat++; | |
| 841 | ncover_stat += ncover; | |
| 842 | #endif | |
| 843 | p->ccount = ccount; | |
| 844 | } else { | |
| abecab39 | 845 | struct cc_undo *u = cc_undo; |
| 984263bc | 846 | while (--undop >= u) { |
| abecab39 SW |
847 | struct cc *x; |
| 848 | if ((*undop->pos = x = undop->val)) | |
| 984263bc MD |
849 | x->ccount++; |
| 850 | } | |
| 851 | } | |
| 852 | } while ((p = *pp++) != 0); | |
| 853 | return pp - tokens; | |
| 854 | } | |
| 855 | ||
| abecab39 SW |
856 | void |
| 857 | cc_output_phase(char *buffer, struct cc **output, int bufsize) | |
| 984263bc | 858 | { |
| abecab39 SW |
859 | int i; |
| 860 | struct cc *p, *p1; | |
| 984263bc MD |
861 | |
| 862 | for (i = 0; i < bufsize;) { | |
| 863 | if ((p = output[i]) == 0) { | |
| 864 | ttputc(buffer[i]); | |
| 865 | i++; | |
| 866 | } else if (p->code >= 0) { | |
| 867 | if (--p->ccount == 0) | |
| 868 | qinsert(p, &cc_q0a); | |
| 869 | (*tt.tt_put_token)(p->code, p->string, p->length); | |
| 870 | wwntokuse++; | |
| 871 | wwntoksave += put_token_score(p->length); | |
| 872 | i += p->length; | |
| 873 | } else if ((p1 = cc_q0a.qback) != &cc_q0a) { | |
| 874 | p->code = p1->code; | |
| 875 | p1->code = -1; | |
| 876 | qinsert(p1, &cc_q1a); | |
| 877 | if (--p->ccount == 0) | |
| 878 | qinsert(p, &cc_q0a); | |
| 879 | else | |
| 880 | qinsert(p, &cc_q0b); | |
| 881 | (*tt.tt_set_token)(p->code, p->string, p->length); | |
| 882 | wwntokdef++; | |
| 883 | wwntoksave -= tt.tt_set_token_cost; | |
| 884 | i += p->length; | |
| 885 | } else { | |
| 886 | p->ccount--; | |
| 887 | ttwrite(p->string, p->length); | |
| 888 | wwntokbad++; | |
| 889 | i += p->length; | |
| 890 | } | |
| 891 | } | |
| 892 | wwntokc += bufsize; | |
| 893 | } | |
| 894 | ||
| abecab39 SW |
895 | int |
| 896 | cc_token_compare(const void *p1, const void *p2) | |
| 984263bc | 897 | { |
| abecab39 SW |
898 | const struct cc * const * vp1 = p1; |
| 899 | const struct cc * const * vp2 = p2; | |
| 900 | return (*vp2)->bcount - (*vp1)->bcount; | |
| 984263bc | 901 | } |