1 /* $NetBSD: compress.c,v 1.7 2009/04/14 08:50:06 lukem Exp $ */
4 * Copyright (c) 1989, 1993
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
8 * Edward Wang at The University of California, Berkeley.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 #include <sys/cdefs.h>
38 static char sccsid[] = "@(#)compress.c 8.1 (Berkeley) 6/6/93";
40 __RCSID("$NetBSD: compress.c,v 1.7 2009/04/14 08:50:06 lukem Exp $");
55 /* tunable parameters */
61 int cc_token_max = 8; /* <= TOKEN_MAX */
62 int cc_token_min = 2; /* > tt.tt_put_token_cost */
66 int cc_bufsize = 1024 * 3; /* XXX, or 80 * 24 * 2 */
78 char string[TOKEN_MAX];
84 long time; /* time last seen */
85 short bcount; /* count in this buffer */
86 short ccount; /* count in compression */
87 short places; /* places in the buffer */
88 short code; /* token code */
89 struct cc *qforw, *qback;
90 struct cc *hforw, **hback;
93 short cc_thresholds[TOKEN_MAX + 1];
94 #define thresh(length) (cc_thresholds[length])
95 #define threshp(code, count, length) \
96 ((code) >= 0 || (short) (count) >= cc_thresholds[length])
99 short cc_wthresholds[TOKEN_MAX + 1];
100 #define wthresh(length) (cc_wthresholds[length])
101 #define wthreshp(weight, length) ((short) (weight) >= cc_wthresholds[length])
103 #define wthreshp(weight, length) (0)
107 short cc_wlimits[TOKEN_MAX + 1];
108 #define wlimit(length) (cc_wlimits[length])
111 #define put_token_score(length) ((length) - tt.tt_put_token_cost)
113 int cc_score_adjustments[TOKEN_MAX + 1][8]; /* XXX, 8 > max of cc_thresholds */
114 #define score_adjust(score, p) \
116 int _length = (p)->length; \
117 int _ccount = (p)->ccount; \
118 if (threshp((p)->code, _ccount, _length) || \
119 wthreshp((p)->weight, _length)) /* XXX */ \
120 (score) -= _length - tt.tt_put_token_cost; \
122 (score) += cc_score_adjustments[_length][_ccount]; \
125 int cc_initial_scores[TOKEN_MAX + 1][8]; /* XXX, 8 > max of cc_thresholds */
127 struct cc cc_q0a, cc_q0b, cc_q1a, cc_q1b;
129 #define qinsert(p1, p2) \
131 struct cc *forw = (p1)->qforw; \
132 struct cc *back = (p1)->qback; \
133 back->qforw = forw; \
134 forw->qback = back; \
135 forw = (p2)->qforw; \
136 (p1)->qforw = forw; \
137 forw->qback = (p1); \
138 (p2)->qforw = (p1); \
139 (p1)->qback = (p2); \
142 #define qinsertq(q, p) \
143 ((q)->qforw == (q) ? 0 : \
144 ((q)->qback->qforw = (p)->qforw, \
145 (p)->qforw->qback = (q)->qback, \
146 (q)->qforw->qback = (p), \
147 (p)->qforw = (q)->qforw, \
152 #define HSIZE (1 << H)
153 #define hash(h, c) (((((h) >> (H - 8)) | (h) << 8) ^ (c)) & (HSIZE - 1))
156 struct cc **cc_output; /* the output array */
157 short *cc_places[TOKEN_MAX + 1];
158 short *cc_hashcodes; /* for computing hashcodes */
159 struct cc **cc_htab; /* the hash table */
160 struct cc **cc_tokens; /* holds all the active tokens */
166 long cc_time, cc_time0;
168 char *cc_tt_ob, *cc_tt_obe;
170 int cc_compress(struct cc **, struct cc **, char);
171 void cc_compress_cleanup(struct cc **, int);
172 void cc_compress_phase(struct cc **, int, struct cc **, int);
173 void cc_compress_phase1(struct cc **, struct cc **, int, int);
174 void cc_output_phase(char *, struct cc **, int);
175 int cc_sweep(char *, int, struct cc **, int);
176 void cc_sweep0(char *, int, int);
177 int cc_sweep_phase(char *, int, struct cc **);
178 void cc_sweep_reverse(struct cc **, short *);
179 int cc_token_compare(const void *, const void *);
187 if (tt.tt_token_max > cc_token_max)
188 tt.tt_token_max = cc_token_max;
189 if (tt.tt_token_min < cc_token_min)
190 tt.tt_token_min = cc_token_min;
191 if (tt.tt_token_min > tt.tt_token_max) {
195 if (tt.tt_ntoken > cc_ntoken / 2) /* not likely */
196 tt.tt_ntoken = cc_ntoken / 2;
197 #define C(x) (sizeof (x) / sizeof *(x))
198 for (i = 0; i < (int)C(cc_thresholds); i++) {
199 int h = i - tt.tt_put_token_cost;
202 (tt.tt_set_token_cost + 1 + h - 1) / h + 1;
204 cc_thresholds[i] = 0;
206 for (i = 0; i < (int)C(cc_score_adjustments); i++) {
207 int t = cc_thresholds[i];
208 for (j = 0; j < (int)C(*cc_score_adjustments); j++) {
210 cc_score_adjustments[i][j] =
211 - (i - tt.tt_put_token_cost);
213 cc_score_adjustments[i][j] = 0;
217 * length * (ccount + 1) a
219 * set-token-cost + length +
220 * ccount * put-token-cost b
221 * the score adjustment is (b - a)
223 cc_score_adjustments[i][j] =
224 tt.tt_set_token_cost + i +
225 j * tt.tt_put_token_cost -
228 cc_initial_scores[i][j] = 0;
231 * - (set-token-cost +
232 * (length - put-token-cost) -
233 * (length - put-token-cost) * ccount)
235 cc_initial_scores[i][j] =
236 - (tt.tt_set_token_cost +
237 (i - tt.tt_put_token_cost) -
238 (i - tt.tt_put_token_cost) * j);
242 for (i = 1; i < C(cc_wthresholds); i++) {
244 ((tt.tt_set_token_cost + tt.tt_put_token_cost) / i +
247 cc_wlimits[i] = cc_wthresholds[i] + cc_weight;
251 if ((cc_output = (struct cc **)
252 malloc((unsigned) cc_bufsize * sizeof *cc_output)) == 0)
254 if ((cc_hashcodes = (short *)
255 malloc((unsigned) cc_bufsize * sizeof *cc_hashcodes)) == 0)
257 if ((cc_htab = (struct cc **) malloc(HSIZE * sizeof *cc_htab)) == 0)
259 if ((cc_tokens = (struct cc **)
261 (cc_ntoken + tt.tt_token_max - tt.tt_token_min + 1) *
262 sizeof *cc_tokens)) == 0)
264 if ((cc_undo = (struct cc_undo *)
265 malloc((unsigned) cc_bufsize * sizeof *cc_undo)) == 0)
267 for (i = tt.tt_token_min; i <= tt.tt_token_max; i++)
268 if ((cc_places[i] = (short *)
269 malloc((unsigned) cc_bufsize * sizeof **cc_places)) == 0)
271 cc_q0a.qforw = cc_q0a.qback = &cc_q0a;
272 cc_q0b.qforw = cc_q0b.qback = &cc_q0b;
273 cc_q1a.qforw = cc_q1a.qback = &cc_q1a;
274 cc_q1b.qforw = cc_q1b.qback = &cc_q1b;
275 if ((p = (struct cc *) malloc((unsigned) cc_ntoken * sizeof *p)) == 0)
277 for (i = 0; i < tt.tt_ntoken; i++) {
280 p->qback = cc_q0a.qback;
286 for (; i < cc_ntoken; i++) {
289 p->qback = cc_q1a.qback;
297 if ((cc_buffer = malloc((unsigned) cc_bufsize)) == 0)
309 tt_obp = tt_ob = cc_buffer;
310 tt_obe = tt_ob + cc_bufsize;
311 tt.tt_flush = ccflush;
313 cc_trace_fp = fopen("window-trace", "a");
314 (void) fcntl(fileno(cc_trace_fp), F_SETFD, 1);
324 memset((char *) cc_htab, 0, HSIZE * sizeof *cc_htab);
325 for (p = cc_q0a.qforw; p != &cc_q0a; p = p->qforw)
327 for (p = cc_q1a.qforw; p != &cc_q1a; p = p->qforw)
336 tt_obp = tt_ob = cc_tt_ob;
339 if (cc_trace_fp != NULL) {
340 (void) fclose(cc_trace_fp);
348 int bufsize = tt_obp - tt_ob;
351 if (tt_ob != cc_buffer)
353 if (cc_trace_fp != NULL) {
354 (void) fwrite(tt_ob, 1, bufsize, cc_trace_fp);
355 (void) putc(-1, cc_trace_fp);
358 (*tt.tt_compress)(1);
359 if (bufsize < tt.tt_token_min) {
363 tt_obp = tt_ob = cc_tt_ob;
367 n = cc_sweep_phase(cc_buffer, bufsize, cc_tokens);
368 cc_compress_phase(cc_output, bufsize, cc_tokens, n);
369 cc_output_phase(cc_buffer, cc_output, bufsize);
371 tt_obp = tt_ob = cc_buffer;
372 tt_obe = cc_buffer + cc_bufsize;
374 (*tt.tt_compress)(0);
375 tt.tt_flush = ccflush;
379 cc_sweep_phase(char *buffer, int bufsize, struct cc **tokens)
381 struct cc **pp = tokens;
393 cc_sweep0(buffer, bufsize, tt.tt_token_min - 1);
399 for (i = tt.tt_token_min; i <= tt.tt_token_max; i++) {
408 (void) fflush(stdout);
411 n = cc_sweep(buffer, bufsize, pp, i);
423 qinsertq(&cc_q1b, &cc_q1a);
426 printf("\n %d tokens, %d candidates\n",
435 cc_sweep0(char *buffer, int n, int length)
441 short pc = tt.tt_padc;
443 /* n and length are at least 1 */
448 if ((*hc++ = *p++) == pc)
454 for (i = n--; --i;) {
455 if ((c = *p++) == pc || *hc < 0)
465 cc_sweep(char *buffer, int bufsize, struct cc **tokens, int length)
471 short *places = cc_places[length];
472 struct cc **pp = tokens;
473 short threshold = thresh(length);
475 short wthreshold = wthresh(length);
476 short limit = wlimit(length);
479 short pc = tt.tt_padc;
486 for (i = 0; i < bufsize; i++, time0++) {
493 if ((hh = *hc1) < 0 || c == pc) {
498 h = cc_htab + (*hc1++ = hash(hh, c));
501 for (p = *h; p != 0; p = p->hforw)
502 if (p->length == (char) length) {
503 char *p1 = p->string;
504 char *p2 = cp - length;
516 (p->time >= cc_time0 && p->length == (char) length))
519 if ((*p->hback = p->hforw) != 0)
520 p->hforw->hback = p->hback;
522 char *p1 = p->string;
523 char *p2 = cp - length;
531 p->weight = cc_weight;
537 if ((p->hforw = *h) != 0)
538 p->hforw->hback = &p->hforw;
547 } else if (p->time < cc_time0) {
549 if ((p->weight += p->time - time0) < 0)
550 p->weight = cc_weight;
551 else if ((p->weight += cc_weight) > limit)
562 if (p->weight >= wthreshold) {
577 } else if (p->time + length > time0) {
579 * overlapping token, don't count as two and
580 * don't update time0, but do adjust weight to offset
584 if (cc_weight != 0) { /* XXX */
585 p->weight += time0 - p->time;
586 if (!p->flag && p->weight >= wthreshold) {
593 places[i] = p->places;
597 if ((p->weight += p->time - time0) < 0)
598 p->weight = cc_weight;
599 else if ((p->weight += cc_weight) > limit)
605 /* code must be < 0 if flag false here */
606 (p->bcount >= threshold
608 || p->weight >= wthreshold
615 places[i] = p->places;
619 if ((i = pp - tokens) > 0) {
622 cc_sweep_reverse(tokens, places);
623 if (cc_sort && i > 1) {
624 qsort((char *) tokens, i, sizeof *tokens,
628 if ((i = i * cc_chop / 100) == 0)
638 cc_sweep_reverse(struct cc **pp, short int *places)
643 while ((p = *pp++) != 0) {
646 /* the list is never empty */
651 } while ((t = frnt) >= 0);
657 cc_compress_phase(struct cc **output, int bufsize, struct cc **tokens, int ntoken)
661 memset((char *) output, 0, bufsize * sizeof *output);
662 for (i = 0; i < cc_npass0; i++)
663 cc_compress_phase1(output, tokens, ntoken, 0);
664 for (i = 0; i < cc_npass1; i++)
665 cc_compress_phase1(output, tokens, ntoken, 1);
666 cc_compress_cleanup(output, bufsize);
670 cc_compress_phase1(struct cc **output, struct cc **tokens, int ntoken, int flag)
675 int nt = 0, cc = 0, nc = 0;
685 while (pp < tokens + ntoken) {
696 printf(" (%d", (*pp)->length);
697 (void) fflush(stdout);
700 pp += cc_compress(output, pp, flag);
703 printf(" %dt %du %dc)", ntoken_stat, ccount_stat,
713 printf("\n total: (%dt %du %dc)\n", nt, cc, nc);
720 cc_compress_cleanup(struct cc **output, int bufsize)
724 /* the previous output phase may have been interrupted */
725 qinsertq(&cc_q0b, &cc_q0a);
726 for (end = output + bufsize; output < end;) {
729 if ((p = *output) == 0) {
735 } else if (p->code >= 0) {
738 } else if (p->ccount == 0) {
740 } else if (p->ccount >= thresh(length)
742 || wthreshp(p->weight, length)
755 cc_compress(struct cc **output, struct cc **tokens, char flag)
757 struct cc **pp = tokens;
758 struct cc *p = *pp++;
759 int length = p->length;
760 int threshold = thresh(length);
762 short wthreshold = wthresh(length);
764 short *places = cc_places[length];
765 int *initial_scores = cc_initial_scores[length];
766 int initial_score0 = put_token_score(length);
770 struct cc_undo *undop;
778 if ((short) ccount >= p->bcount)
780 if (p->code >= 0 || ccount >= threshold)
783 else if (p->weight >= wthreshold)
784 /* allow one fewer match than normal */
785 /* XXX, should adjust for ccount */
786 score = - tt.tt_set_token_cost;
789 score = initial_scores[ccount];
794 for (i = p->places; i >= 0; i = places[i]) {
797 struct cc **ip = output + i;
798 int score0 = initial_score0;
799 struct cc **iip = ip + length;
800 struct cc_undo *undop1 = undop;
802 if ((x = *(jp = ip)) != 0)
804 while (--jp >= output)
805 if ((x = *jp) != 0) {
806 if (jp + x->length > ip)
812 if ((x = *jp) == 0) {
827 score_adjust(score0, x);
828 if (score0 < 0 && flag)
840 while (--undop >= undop1)
841 if ((*undop->pos = x = undop->val))
847 ccount_stat += ccount - p->ccount;
849 ncover_stat += ncover;
853 struct cc_undo *u = cc_undo;
854 while (--undop >= u) {
856 if ((*undop->pos = x = undop->val))
860 } while ((p = *pp++) != 0);
865 cc_output_phase(char *buffer, struct cc **output, int bufsize)
870 for (i = 0; i < bufsize;) {
871 if ((p = output[i]) == 0) {
874 } else if (p->code >= 0) {
875 if (--p->ccount == 0)
877 (*tt.tt_put_token)(p->code, p->string, p->length);
879 wwntoksave += put_token_score(p->length);
881 } else if ((p1 = cc_q0a.qback) != &cc_q0a) {
884 qinsert(p1, &cc_q1a);
885 if (--p->ccount == 0)
889 (*tt.tt_set_token)(p->code, p->string, p->length);
891 wwntoksave -= tt.tt_set_token_cost;
895 ttwrite(p->string, p->length);
904 cc_token_compare(const void *p1, const void *p2)
906 const struct cc * const * vp1 = p1;
907 const struct cc * const * vp2 = p2;
908 return (*vp2)->bcount - (*vp1)->bcount;