2 Copyright (C) 1989, 1990, 1991, 1992, 2000, 2004, 2007, 2009
3 Free Software Foundation, Inc.
4 Written by James Clark (jjc@jclark.com)
6 This file is part of groff.
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
29 void yyerror(const char *);
32 static const char *format_serial(char c, int n);
39 label_info(const string &);
42 label_info *lookup_label(const string &label);
46 // Does the tentative label depend on the reference?
47 CONTAINS_VARIABLE = 01,
52 virtual ~expression() { }
53 virtual void evaluate(int, const reference &, string &,
54 substring_position &) = 0;
55 virtual unsigned analyze() { return 0; }
58 class at_expr : public expression {
61 void evaluate(int, const reference &, string &, substring_position &);
62 unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
65 class format_expr : public expression {
70 format_expr(char c, int w = 0, int f = 1)
71 : type(c), width(w), first_number(f) { }
72 void evaluate(int, const reference &, string &, substring_position &);
73 unsigned analyze() { return CONTAINS_FORMAT; }
76 class field_expr : public expression {
80 field_expr(char nm, int num) : number(num), name(nm) { }
81 void evaluate(int, const reference &, string &, substring_position &);
82 unsigned analyze() { return CONTAINS_VARIABLE; }
85 class literal_expr : public expression {
88 literal_expr(const char *ptr, int len) : s(ptr, len) { }
89 void evaluate(int, const reference &, string &, substring_position &);
92 class unary_expr : public expression {
96 unary_expr(expression *e) : expr(e) { }
97 ~unary_expr() { delete expr; }
98 void evaluate(int, const reference &, string &, substring_position &) = 0;
99 unsigned analyze() { return expr ? expr->analyze() : 0; }
102 // This caches the analysis of an expression.
104 class analyzed_expr : public unary_expr {
107 analyzed_expr(expression *);
108 void evaluate(int, const reference &, string &, substring_position &);
109 unsigned analyze() { return flags; }
112 class star_expr : public unary_expr {
114 star_expr(expression *e) : unary_expr(e) { }
115 void evaluate(int, const reference &, string &, substring_position &);
117 return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
122 typedef void map_func(const char *, const char *, string &);
124 class map_expr : public unary_expr {
127 map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
128 void evaluate(int, const reference &, string &, substring_position &);
131 typedef const char *extractor_func(const char *, const char *, const char **);
133 class extractor_expr : public unary_expr {
135 extractor_func *func;
137 enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
138 extractor_expr(expression *e, extractor_func *f, int pt)
139 : unary_expr(e), part(pt), func(f) { }
140 void evaluate(int, const reference &, string &, substring_position &);
143 class truncate_expr : public unary_expr {
146 truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
147 void evaluate(int, const reference &, string &, substring_position &);
150 class separator_expr : public unary_expr {
152 separator_expr(expression *e) : unary_expr(e) { }
153 void evaluate(int, const reference &, string &, substring_position &);
156 class binary_expr : public expression {
161 binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
162 ~binary_expr() { delete expr1; delete expr2; }
163 void evaluate(int, const reference &, string &, substring_position &) = 0;
165 return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
169 class alternative_expr : public binary_expr {
171 alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
172 void evaluate(int, const reference &, string &, substring_position &);
175 class list_expr : public binary_expr {
177 list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
178 void evaluate(int, const reference &, string &, substring_position &);
181 class substitute_expr : public binary_expr {
183 substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
184 void evaluate(int, const reference &, string &, substring_position &);
187 class ternary_expr : public expression {
193 ternary_expr(expression *e1, expression *e2, expression *e3)
194 : expr1(e1), expr2(e2), expr3(e3) { }
195 ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
196 void evaluate(int, const reference &, string &, substring_position &) = 0;
198 return ((expr1 ? expr1->analyze() : 0)
199 | (expr2 ? expr2->analyze() : 0)
200 | (expr3 ? expr3->analyze() : 0));
204 class conditional_expr : public ternary_expr {
206 conditional_expr(expression *e1, expression *e2, expression *e3)
207 : ternary_expr(e1, e2, e3) { }
208 void evaluate(int, const reference &, string &, substring_position &);
211 static expression *parsed_label = 0;
212 static expression *parsed_date_label = 0;
213 static expression *parsed_short_label = 0;
215 static expression *parse_result;
224 struct { int ndigits; int val; } dig;
225 struct { int start; int len; } str;
228 /* uppercase or lowercase letter */
229 %token <num> TOKEN_LETTER
230 /* literal characters */
231 %token <str> TOKEN_LITERAL
233 %token <num> TOKEN_DIGIT
235 %type <expr> conditional
236 %type <expr> alternative
239 %type <expr> substitute
240 %type <expr> optional_conditional
243 %type <num> optional_number
250 { parse_result = ($1 ? new analyzed_expr($1) : 0); }
256 | alternative '?' optional_conditional ':' conditional
257 { $$ = new conditional_expr($1, $3, $5); }
260 optional_conditional:
270 | alternative '|' list
271 { $$ = new alternative_expr($1, $3); }
272 | alternative '&' list
273 { $$ = new conditional_expr($1, $3, 0); }
280 { $$ = new list_expr($1, $2); }
286 | substitute '~' string
287 { $$ = new substitute_expr($1, $3); }
292 { $$ = new at_expr; }
295 $$ = new literal_expr(literals.contents() + $1.start,
299 { $$ = new field_expr($1, 0); }
300 | TOKEN_LETTER number
301 { $$ = new field_expr($1, $2 - 1); }
309 $$ = new format_expr($2);
312 command_error("unrecognized format `%1'", char($2));
313 $$ = new format_expr('a');
320 $$ = new format_expr('0', $2.ndigits, $2.val);
322 | string '.' flag TOKEN_LETTER optional_number
326 $$ = new map_expr($1, lowercase);
329 $$ = new map_expr($1, uppercase);
332 $$ = new map_expr($1, capitalize);
335 $$ = new map_expr($1, reverse_name);
338 $$ = new map_expr($1, abbreviate_name);
341 $$ = new extractor_expr($1, find_year, $3);
344 $$ = new extractor_expr($1, find_last_name, $3);
348 command_error("unknown function `%1'", char($4));
354 { $$ = new truncate_expr($1, $3); }
356 { $$ = new truncate_expr($1, -$3); }
358 { $$ = new star_expr($1); }
359 | '(' optional_conditional ')'
361 | '<' optional_conditional '>'
362 { $$ = new separator_expr($2); }
381 { $$.ndigits = 1; $$.val = $1; }
383 { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
398 /* bison defines const to be empty unless __STDC__ is defined, which it
399 isn't under cfront */
405 const char *spec_ptr;
406 const char *spec_end;
407 const char *spec_cur;
409 static char uppercase_array[] = {
410 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
411 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
412 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
416 static char lowercase_array[] = {
417 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
418 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
419 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
425 while (spec_ptr < spec_end && csspace(*spec_ptr))
428 if (spec_ptr >= spec_end)
430 unsigned char c = *spec_ptr++;
436 yylval.num = c - '0';
440 yylval.str.start = literals.length();
441 for (; spec_ptr < spec_end; spec_ptr++) {
442 if (*spec_ptr == '\'') {
443 if (++spec_ptr < spec_end && *spec_ptr == '\'')
446 yylval.str.len = literals.length() - yylval.str.start;
447 return TOKEN_LITERAL;
451 literals += *spec_ptr;
453 yylval.str.len = literals.length() - yylval.str.start;
454 return TOKEN_LITERAL;
459 int set_label_spec(const char *label_spec)
461 spec_cur = spec_ptr = label_spec;
462 spec_end = strchr(label_spec, '\0');
467 parsed_label = parse_result;
471 int set_date_label_spec(const char *label_spec)
473 spec_cur = spec_ptr = label_spec;
474 spec_end = strchr(label_spec, '\0');
478 delete parsed_date_label;
479 parsed_date_label = parse_result;
483 int set_short_label_spec(const char *label_spec)
485 spec_cur = spec_ptr = label_spec;
486 spec_end = strchr(label_spec, '\0');
490 delete parsed_short_label;
491 parsed_short_label = parse_result;
495 void yyerror(const char *message)
497 if (spec_cur < spec_end)
498 command_error("label specification %1 before `%2'", message, spec_cur);
500 command_error("label specification %1 at end of string",
504 void at_expr::evaluate(int tentative, const reference &ref,
505 string &result, substring_position &)
508 ref.canonicalize_authors(result);
510 const char *end, *start = ref.get_authors(&end);
512 result.append(start, end - start);
516 void format_expr::evaluate(int tentative, const reference &ref,
517 string &result, substring_position &)
521 const label_info *lp = ref.get_label_ptr();
522 int num = lp == 0 ? ref.get_number() : lp->count;
524 result += format_serial(type, num + 1);
526 const char *ptr = i_to_a(num + first_number);
527 int pad = width - strlen(ptr);
534 static const char *format_serial(char c, int n)
537 static char buf[128]; // more than enough.
543 // troff uses z and w to represent 10000 and 5000 in Roman
544 // numerals; I can find no historical basis for this usage
545 const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
552 for (int i = 1000; i > 0; i /= 10, s += 2) {
599 // this is derived from troff/reg.c
606 *p++ = c == 'a' ? lowercase_array[d - 1] :
607 uppercase_array[d - 1];
627 void field_expr::evaluate(int, const reference &ref,
628 string &result, substring_position &)
631 const char *start = ref.get_field(name, &end);
633 start = nth_field(number, start, &end);
635 result.append(start, end - start);
639 void literal_expr::evaluate(int, const reference &,
640 string &result, substring_position &)
645 analyzed_expr::analyzed_expr(expression *e)
646 : unary_expr(e), flags(e ? e->analyze() : 0)
650 void analyzed_expr::evaluate(int tentative, const reference &ref,
651 string &result, substring_position &pos)
654 expr->evaluate(tentative, ref, result, pos);
657 void star_expr::evaluate(int tentative, const reference &ref,
658 string &result, substring_position &pos)
660 const label_info *lp = ref.get_label_ptr();
662 && (lp == 0 || lp->total > 1)
664 expr->evaluate(tentative, ref, result, pos);
667 void separator_expr::evaluate(int tentative, const reference &ref,
668 string &result, substring_position &pos)
670 int start_length = result.length();
671 int is_first = pos.start < 0;
673 expr->evaluate(tentative, ref, result, pos);
675 pos.start = start_length;
676 pos.length = result.length() - start_length;
680 void map_expr::evaluate(int tentative, const reference &ref,
681 string &result, substring_position &)
685 substring_position temp_pos;
686 expr->evaluate(tentative, ref, temp, temp_pos);
687 (*func)(temp.contents(), temp.contents() + temp.length(), result);
691 void extractor_expr::evaluate(int tentative, const reference &ref,
692 string &result, substring_position &)
696 substring_position temp_pos;
697 expr->evaluate(tentative, ref, temp, temp_pos);
698 const char *end, *start = (*func)(temp.contents(),
699 temp.contents() + temp.length(),
704 result.append(temp.contents(), start - temp.contents());
710 result.append(start, end - start);
714 result.append(end, temp.contents() + temp.length() - end);
722 static void first_part(int len, const char *ptr, const char *end,
726 const char *token_start = ptr;
727 if (!get_token(&ptr, end))
729 const token_info *ti = lookup_token(token_start, ptr);
730 int counts = ti->sortify_non_empty(token_start, ptr);
731 if (counts && --len < 0)
733 if (counts || ti->is_accent())
734 result.append(token_start, ptr - token_start);
738 static void last_part(int len, const char *ptr, const char *end,
741 const char *start = ptr;
744 const char *token_start = ptr;
745 if (!get_token(&ptr, end))
747 const token_info *ti = lookup_token(token_start, ptr);
748 if (ti->sortify_non_empty(token_start, ptr))
752 int skip = count - len;
755 const char *token_start = ptr;
756 if (!get_token(&ptr, end))
758 const token_info *ti = lookup_token(token_start, ptr);
759 if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
765 first_part(len, ptr, end, result);
768 void truncate_expr::evaluate(int tentative, const reference &ref,
769 string &result, substring_position &)
773 substring_position temp_pos;
774 expr->evaluate(tentative, ref, temp, temp_pos);
775 const char *start = temp.contents();
776 const char *end = start + temp.length();
778 first_part(n, start, end, result);
780 last_part(-n, start, end, result);
784 void alternative_expr::evaluate(int tentative, const reference &ref,
785 string &result, substring_position &pos)
787 int start_length = result.length();
789 expr1->evaluate(tentative, ref, result, pos);
790 if (result.length() == start_length && expr2)
791 expr2->evaluate(tentative, ref, result, pos);
794 void list_expr::evaluate(int tentative, const reference &ref,
795 string &result, substring_position &pos)
798 expr1->evaluate(tentative, ref, result, pos);
800 expr2->evaluate(tentative, ref, result, pos);
803 void substitute_expr::evaluate(int tentative, const reference &ref,
804 string &result, substring_position &pos)
806 int start_length = result.length();
808 expr1->evaluate(tentative, ref, result, pos);
809 if (result.length() > start_length && result[result.length() - 1] == '-') {
810 // ought to see if pos covers the -
811 result.set_length(result.length() - 1);
813 expr2->evaluate(tentative, ref, result, pos);
817 void conditional_expr::evaluate(int tentative, const reference &ref,
818 string &result, substring_position &pos)
821 substring_position temp_pos;
823 expr1->evaluate(tentative, ref, temp, temp_pos);
824 if (temp.length() > 0) {
826 expr2->evaluate(tentative, ref, result, pos);
830 expr3->evaluate(tentative, ref, result, pos);
834 void reference::pre_compute_label()
836 if (parsed_label != 0
837 && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
839 substring_position temp_pos;
840 parsed_label->evaluate(1, *this, label, temp_pos);
841 label_ptr = lookup_label(label);
845 void reference::compute_label()
849 parsed_label->evaluate(0, *this, label, separator_pos);
850 if (short_label_flag && parsed_short_label)
851 parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
854 if (parsed_date_label) {
855 substring_position temp_pos;
856 parsed_date_label->evaluate(0, *this, new_date, temp_pos);
861 label_ptr->count += 1;
864 void reference::immediate_compute_label()
867 label_ptr->total = 2; // force use of disambiguator
871 int reference::merge_labels(reference **v, int n, label_type type,
874 if (abbreviate_label_ranges)
875 return merge_labels_by_number(v, n, type, result);
877 return merge_labels_by_parts(v, n, type, result);
880 int reference::merge_labels_by_number(reference **v, int n, label_type type,
885 int num = get_number();
886 // Only merge three or more labels.
887 if (v[0]->get_number() != num + 1
888 || v[1]->get_number() != num + 2)
891 for (i = 2; i < n; i++)
892 if (v[i]->get_number() != num + i + 1)
894 result = get_label(type);
895 result += label_range_indicator;
896 result += v[i - 1]->get_label(type);
900 const substring_position &reference::get_separator_pos(label_type type) const
902 if (type == SHORT_LABEL && short_label_flag)
903 return short_separator_pos;
905 return separator_pos;
908 const string &reference::get_label(label_type type) const
910 if (type == SHORT_LABEL && short_label_flag)
916 int reference::merge_labels_by_parts(reference **v, int n, label_type type,
921 const string &lb = get_label(type);
922 const substring_position &sp = get_separator_pos(type);
924 || sp.start != v[0]->get_separator_pos(type).start
925 || memcmp(lb.contents(), v[0]->get_label(type).contents(),
931 result += separate_label_second_parts;
932 const substring_position &s = v[i]->get_separator_pos(type);
933 int sep_end_pos = s.start + s.length;
934 result.append(v[i]->get_label(type).contents() + sep_end_pos,
935 v[i]->get_label(type).length() - sep_end_pos);
937 && sp.start == v[i]->get_separator_pos(type).start
938 && memcmp(lb.contents(), v[i]->get_label(type).contents(),
945 label_info::label_info(const string &s)
946 : start(label_pool.length()), length(s.length()), count(0), total(1)
951 static label_info **label_table = 0;
952 static int label_table_size = 0;
953 static int label_table_used = 0;
955 label_info *lookup_label(const string &label)
957 if (label_table == 0) {
958 label_table = new label_info *[17];
959 label_table_size = 17;
960 for (int i = 0; i < 17; i++)
963 unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
965 for (ptr = label_table + h;
968 ? (ptr = label_table + label_table_size - 1)
970 if ((*ptr)->length == label.length()
971 && memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
972 label.length()) == 0) {
976 label_info *result = *ptr = new label_info(label);
977 if (++label_table_used * 2 > label_table_size) {
979 label_info **old_table = label_table;
980 int old_size = label_table_size;
981 label_table_size = next_size(label_table_size);
982 label_table = new label_info *[label_table_size];
984 for (i = 0; i < label_table_size; i++)
986 for (i = 0; i < old_size; i++)
988 h = hash_string(label_pool.contents() + old_table[i]->start,
989 old_table[i]->length);
991 for (p = label_table + (h % label_table_size);
994 ? (p = label_table + label_table_size - 1)
1006 for (int i = 0; i < label_table_size; i++) {
1007 delete label_table[i];
1010 label_table_used = 0;
1014 static void consider_authors(reference **start, reference **end, int i);
1016 void compute_labels(reference **v, int n)
1019 && (parsed_label->analyze() & expression::CONTAINS_AT)
1020 && sort_fields.length() >= 2
1021 && sort_fields[0] == 'A'
1022 && sort_fields[1] == '+')
1023 consider_authors(v, v + n, 0);
1024 for (int i = 0; i < n; i++)
1025 v[i]->compute_label();
1029 /* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1030 where 0 <= i <= N if there exists a reference with a list of authors
1031 <B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1032 and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0,
1033 A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
1034 <B0,B1,...,BM>. If a reference needs author i we only have to call
1035 need_author(j) for some j >= i such that the reference also needs
1038 /* This function handles 2 tasks:
1039 determine which authors are needed (cannot be elided with et al.);
1040 determine which authors can have only last names in the labels.
1042 References >= start and < end have the same first i author names.
1043 Also they're sorted by A+. */
1045 static void consider_authors(reference **start, reference **end, int i)
1049 reference **p = start;
1050 if (i >= (*p)->get_nauthors()) {
1051 for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1053 if (p < end && i > 0) {
1054 // If we have an author list <A B C> and an author list <A B C D>,
1055 // then both lists need C.
1056 for (reference **q = start; q < end; q++)
1057 (*q)->need_author(i - 1);
1062 reference **last_name_start = p;
1063 reference **name_start = p;
1065 p < end && i < (*p)->get_nauthors()
1066 && same_author_last_name(**last_name_start, **p, i);
1068 if (!same_author_name(**name_start, **p, i)) {
1069 consider_authors(name_start, p, i + 1);
1073 consider_authors(name_start, p, i + 1);
1074 if (last_name_start == name_start) {
1075 for (reference **q = last_name_start; q < p; q++)
1076 (*q)->set_last_name_unambiguous(i);
1078 // If we have an author list <A B C D> and <A B C E>, then the lists
1079 // need author D and E respectively.
1080 if (name_start > start || p < end) {
1081 for (reference **q = last_name_start; q < p; q++)
1082 (*q)->need_author(i);
1087 int same_author_last_name(const reference &r1, const reference &r2, int n)
1090 const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1092 const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1093 if (!as1 && !as2) return 1; // they are the same
1094 if (!as1 || !as2) return 0;
1095 return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1098 int same_author_name(const reference &r1, const reference &r2, int n)
1101 const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1103 const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1104 if (!as1 && !as2) return 1; // they are the same
1105 if (!as1 || !as2) return 0;
1106 return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1110 void int_set::set(int i)
1114 if (bytei >= v.length()) {
1115 int old_length = v.length();
1116 v.set_length(bytei + 1);
1117 for (int j = old_length; j <= bytei; j++)
1120 v[bytei] |= 1 << (i & 7);
1123 int int_set::get(int i) const
1127 return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1130 void reference::set_last_name_unambiguous(int i)
1132 last_name_unambiguous.set(i);
1135 void reference::need_author(int n)
1137 if (n > last_needed_author)
1138 last_needed_author = n;
1141 const char *reference::get_authors(const char **end) const
1143 if (!computed_authors) {
1144 ((reference *)this)->computed_authors = 1;
1145 string &result = ((reference *)this)->authors;
1146 int na = get_nauthors();
1148 for (int i = 0; i < na; i++) {
1149 if (last_name_unambiguous.get(i)) {
1150 const char *e, *start = get_author_last_name(i, &e);
1152 result.append(start, e - start);
1155 const char *e, *start = get_author(i, &e);
1157 result.append(start, e - start);
1159 if (i == last_needed_author
1160 && et_al.length() > 0
1161 && et_al_min_elide > 0
1162 && last_needed_author + et_al_min_elide < na
1163 && na >= et_al_min_total) {
1169 result += join_authors_exactly_two;
1170 else if (i < na - 2)
1171 result += join_authors_default;
1173 result += join_authors_last_two;
1177 const char *start = authors.contents();
1178 *end = start + authors.length();
1182 int reference::get_nauthors() const
1187 for (na = 0; get_author(na, &dummy) != 0; na++)
1189 ((reference *)this)->nauthors = na;