Import mdocml-1.10.9
[dragonfly.git] / contrib / mdocml / html.c
1 /*      $Id: html.c,v 1.124 2010/12/27 21:41:05 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
21 #include <sys/types.h>
22
23 #include <assert.h>
24 #include <ctype.h>
25 #include <stdarg.h>
26 #include <stdio.h>
27 #include <stdint.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31
32 #include "mandoc.h"
33 #include "out.h"
34 #include "chars.h"
35 #include "html.h"
36 #include "main.h"
37
/*
 * Static description of one HTML element: the name that is printed
 * between the angle brackets and a bitmask of the HTML_* flags below,
 * which print_otag() and print_ctag() consult when emitting the tag.
 */
struct  htmldata {
        const char       *name;  /* Element name as written to the output. */
        int               flags; /* Bitwise OR of the flags defined below. */
/* A newline follows the tag; no leading space is put before the open tag. */
#define HTML_CLRLINE     (1 << 0)
/* print_otag() does not push the tag onto the stack of open scopes. */
#define HTML_NOSTACK     (1 << 1)
#define HTML_AUTOCLOSE   (1 << 2) /* Tag has auto-closure. */
};
45
/*
 * Element table, indexed by enum htmltag (see the uses of
 * htmltags[tag] in print_otag() and print_ctag()).  The entries are
 * positional, so their order has to match the enumeration; the
 * trailing comment on each row names the enumerator it is meant for.
 */
static  const struct htmldata htmltags[TAG_MAX] = {
        {"html",        HTML_CLRLINE}, /* TAG_HTML */
        {"head",        HTML_CLRLINE}, /* TAG_HEAD */
        {"body",        HTML_CLRLINE}, /* TAG_BODY */
        {"meta",        HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */
        {"title",       HTML_CLRLINE}, /* TAG_TITLE */
        {"div",         HTML_CLRLINE}, /* TAG_DIV */
        {"h1",          0}, /* TAG_H1 */
        {"h2",          0}, /* TAG_H2 */
        {"span",        0}, /* TAG_SPAN */
        {"link",        HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */
        {"br",          HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */
        {"a",           0}, /* TAG_A */
        {"table",       HTML_CLRLINE}, /* TAG_TABLE */
        {"tbody",       HTML_CLRLINE}, /* TAG_TBODY */
        {"col",         HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */
        {"tr",          HTML_CLRLINE}, /* TAG_TR */
        {"td",          HTML_CLRLINE}, /* TAG_TD */
        {"li",          HTML_CLRLINE}, /* TAG_LI */
        {"ul",          HTML_CLRLINE}, /* TAG_UL */
        {"ol",          HTML_CLRLINE}, /* TAG_OL */
        {"dl",          HTML_CLRLINE}, /* TAG_DL */
        {"dt",          HTML_CLRLINE}, /* TAG_DT */
        {"dd",          HTML_CLRLINE}, /* TAG_DD */
        {"blockquote",  HTML_CLRLINE}, /* TAG_BLOCKQUOTE */
        {"p",           HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */
        {"pre",         HTML_CLRLINE }, /* TAG_PRE */
        {"b",           0 }, /* TAG_B */
        {"i",           0 }, /* TAG_I */
        {"code",        0 }, /* TAG_CODE */
        {"small",       0 }, /* TAG_SMALL */
};
78
/*
 * Attribute names, indexed by the key member of struct htmlpair
 * (print_otag() looks up htmlattrs[p[i].key]).  Positional like the
 * element table: the order has to match the ATTR_* enumerators named
 * in the trailing comments.
 */
static  const char      *const htmlattrs[ATTR_MAX] = {
        "http-equiv", /* ATTR_HTTPEQUIV */
        "content", /* ATTR_CONTENT */
        "name", /* ATTR_NAME */
        "rel", /* ATTR_REL */
        "href", /* ATTR_HREF */
        "type", /* ATTR_TYPE */
        "media", /* ATTR_MEDIA */
        "class", /* ATTR_CLASS */
        "style", /* ATTR_STYLE */
        "width", /* ATTR_WIDTH */
        "id", /* ATTR_ID */
        "summary", /* ATTR_SUMMARY */
        "align", /* ATTR_ALIGN */
};
94
95 static  void              print_spec(struct html *, enum roffdeco,
96                                 const char *, size_t);
97 static  void              print_res(struct html *, const char *, size_t);
98 static  void              print_ctag(struct html *, enum htmltag);
99 static  void              print_doctype(struct html *);
100 static  void              print_xmltype(struct html *);
101 static  int               print_encode(struct html *, const char *, int);
102 static  void              print_metaf(struct html *, enum roffdeco);
103 static  void              print_attr(struct html *, 
104                                 const char *, const char *);
105 static  void             *ml_alloc(char *, enum htmltype);
106
107
108 static void *
109 ml_alloc(char *outopts, enum htmltype type)
110 {
111         struct html     *h;
112         const char      *toks[4];
113         char            *v;
114
115         toks[0] = "style";
116         toks[1] = "man";
117         toks[2] = "includes";
118         toks[3] = NULL;
119
120         h = calloc(1, sizeof(struct html));
121         if (NULL == h) {
122                 perror(NULL);
123                 exit((int)MANDOCLEVEL_SYSERR);
124         }
125
126         h->type = type;
127         h->tags.head = NULL;
128         h->symtab = chars_init(CHARS_HTML);
129
130         while (outopts && *outopts)
131                 switch (getsubopt(&outopts, UNCONST(toks), &v)) {
132                 case (0):
133                         h->style = v;
134                         break;
135                 case (1):
136                         h->base_man = v;
137                         break;
138                 case (2):
139                         h->base_includes = v;
140                         break;
141                 default:
142                         break;
143                 }
144
145         return(h);
146 }
147
148 void *
149 html_alloc(char *outopts)
150 {
151
152         return(ml_alloc(outopts, HTML_HTML_4_01_STRICT));
153 }
154
155
156 void *
157 xhtml_alloc(char *outopts)
158 {
159
160         return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT));
161 }
162
163
164 void
165 html_free(void *p)
166 {
167         struct tag      *tag;
168         struct html     *h;
169
170         h = (struct html *)p;
171
172         while ((tag = h->tags.head) != NULL) {
173                 h->tags.head = tag->next;       
174                 free(tag);
175         }
176         
177         if (h->symtab)
178                 chars_free(h->symtab);
179
180         free(h);
181 }
182
183
184 void
185 print_gen_head(struct html *h)
186 {
187         struct htmlpair  tag[4];
188
189         tag[0].key = ATTR_HTTPEQUIV;
190         tag[0].val = "Content-Type";
191         tag[1].key = ATTR_CONTENT;
192         tag[1].val = "text/html; charset=utf-8";
193         print_otag(h, TAG_META, 2, tag);
194
195         tag[0].key = ATTR_NAME;
196         tag[0].val = "resource-type";
197         tag[1].key = ATTR_CONTENT;
198         tag[1].val = "document";
199         print_otag(h, TAG_META, 2, tag);
200
201         if (h->style) {
202                 tag[0].key = ATTR_REL;
203                 tag[0].val = "stylesheet";
204                 tag[1].key = ATTR_HREF;
205                 tag[1].val = h->style;
206                 tag[2].key = ATTR_TYPE;
207                 tag[2].val = "text/css";
208                 tag[3].key = ATTR_MEDIA;
209                 tag[3].val = "all";
210                 print_otag(h, TAG_LINK, 4, tag);
211         }
212 }
213
214
215 static void
216 print_spec(struct html *h, enum roffdeco d, const char *p, size_t len)
217 {
218         int              cp;
219         const char      *rhs;
220         size_t           sz;
221
222         if ((cp = chars_spec2cp(h->symtab, p, len)) > 0) {
223                 printf("&#%d;", cp);
224                 return;
225         } else if (-1 == cp && DECO_SSPECIAL == d) {
226                 fwrite(p, 1, len, stdout);
227                 return;
228         } else if (-1 == cp)
229                 return;
230
231         if (NULL != (rhs = chars_spec2str(h->symtab, p, len, &sz)))
232                 fwrite(rhs, 1, sz, stdout);
233 }
234
235
236 static void
237 print_res(struct html *h, const char *p, size_t len)
238 {
239         int              cp;
240         const char      *rhs;
241         size_t           sz;
242
243         if ((cp = chars_res2cp(h->symtab, p, len)) > 0) {
244                 printf("&#%d;", cp);
245                 return;
246         } else if (-1 == cp)
247                 return;
248
249         if (NULL != (rhs = chars_res2str(h->symtab, p, len, &sz)))
250                 fwrite(rhs, 1, sz, stdout);
251 }
252
253
254 static void
255 print_metaf(struct html *h, enum roffdeco deco)
256 {
257         enum htmlfont    font;
258
259         switch (deco) {
260         case (DECO_PREVIOUS):
261                 font = h->metal;
262                 break;
263         case (DECO_ITALIC):
264                 font = HTMLFONT_ITALIC;
265                 break;
266         case (DECO_BOLD):
267                 font = HTMLFONT_BOLD;
268                 break;
269         case (DECO_ROMAN):
270                 font = HTMLFONT_NONE;
271                 break;
272         default:
273                 abort();
274                 /* NOTREACHED */
275         }
276
277         if (h->metaf) {
278                 print_tagq(h, h->metaf);
279                 h->metaf = NULL;
280         }
281
282         h->metal = h->metac;
283         h->metac = font;
284
285         if (HTMLFONT_NONE != font)
286                 h->metaf = HTMLFONT_BOLD == font ?
287                         print_otag(h, TAG_B, 0, NULL) :
288                         print_otag(h, TAG_I, 0, NULL);
289 }
290
291
/*
 * Write the string p to stdout, escaping what HTML requires and
 * interpreting roff escape sequences.
 *
 * '<', '>' and '&' become entities, ASCII_HYPH becomes a plain '-',
 * and a backslash introduces an escape that is classified by
 * a2roffdeco().  Special and reserved escapes are printed through
 * print_spec() and print_res(); font escapes call print_metaf()
 * unless norecurse is non-zero, in which case they are skipped.
 *
 * Returns 1 when the last escape processed was DECO_NOSPACE and it
 * ended the string, otherwise 0.
 */
static int
print_encode(struct html *h, const char *p, int norecurse)
{
        size_t           sz;
        int              len, nospace;
        const char      *seq;
        enum roffdeco    deco;
        /* Characters that interrupt a run of directly printable text. */
        static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };

        nospace = 0;

        for (; *p; p++) {
                /* Copy the longest prefix that needs no treatment. */
                sz = strcspn(p, rejs);

                fwrite(p, 1, sz, stdout);
                p += /* LINTED */
                        sz;

                /* p now sits on a character from rejs or on the NUL. */
                if ('<' == *p) {
                        printf("&lt;");
                        continue;
                } else if ('>' == *p) {
                        printf("&gt;");
                        continue;
                } else if ('&' == *p) {
                        printf("&amp;");
                        continue;
                } else if (ASCII_HYPH == *p) {
                        /*
                         * Note: "soft hyphens" aren't graphically
                         * displayed when not breaking the text; we want
                         * them to be displayed.
                         */
                        /*printf("&#173;");*/
                        putchar('-');
                        continue;
                } else if ('\0' == *p)
                        break;

                /*
                 * Only a backslash is left.  Step over it and let
                 * a2roffdeco() classify the escape.
                 * NOTE(review): len is presumably the number of bytes
                 * the escape occupies after the backslash, and seq/sz
                 * its payload -- confirm against a2roffdeco().
                 */
                seq = ++p;
                len = a2roffdeco(&deco, &seq, &sz);

                switch (deco) {
                case (DECO_RESERVED):
                        print_res(h, seq, sz);
                        break;
                case (DECO_SSPECIAL):
                        /* FALLTHROUGH */
                case (DECO_SPECIAL):
                        print_spec(h, deco, seq, sz);
                        break;
                case (DECO_PREVIOUS):
                        /* FALLTHROUGH */
                case (DECO_BOLD):
                        /* FALLTHROUGH */
                case (DECO_ITALIC):
                        /* FALLTHROUGH */
                case (DECO_ROMAN):
                        /* Font changes are ignored when norecurse is set. */
                        if (norecurse)
                                break;
                        print_metaf(h, deco);
                        break;
                default:
                        break;
                }

                /* The loop's own p++ supplies the remaining step. */
                p += len - 1;

                if (DECO_NOSPACE == deco && '\0' == *(p + 1))
                        nospace = 1;
        }

        return(nospace);
}
366
367
/*
 * Print one attribute as ` key="val"`.  The key is written verbatim;
 * the value goes through print_encode() with norecurse set, so it is
 * HTML-escaped and font escapes inside it open no elements.
 */
static void
print_attr(struct html *h, const char *key, const char *val)
{

        printf(" %s=\"", key);
        (void)print_encode(h, val, 1);
        putchar('"');
}
375
376
/*
 * Print the opening tag of element `tag` with the sz attributes in p.
 *
 * Unless the element is flagged HTML_NOSTACK, a new entry is pushed
 * onto the stack of open scopes (h->tags) so that print_tagq() or
 * print_stagq() can close it later.  Allocation failure prints an
 * error and exits with MANDOCLEVEL_SYSERR.
 *
 * Returns the new stack entry, or NULL for HTML_NOSTACK elements.
 */
struct tag *
print_otag(struct html *h, enum htmltag tag, 
                int sz, const struct htmlpair *p)
{
        int              i;
        struct tag      *t;

        /* Push this tag onto the stack of open scopes. */

        if ( ! (HTML_NOSTACK & htmltags[tag].flags)) {
                t = malloc(sizeof(struct tag));
                if (NULL == t) {
                        perror(NULL);
                        exit((int)MANDOCLEVEL_SYSERR);
                }
                t->tag = tag;
                t->next = h->tags.head;
                h->tags.head = t;
        } else
                t = NULL;

        /*
         * In-line elements (those without HTML_CLRLINE) are separated
         * from preceding output by a space, unless spacing is
         * currently suppressed.
         */
        if ( ! (HTML_NOSPACE & h->flags))
                if ( ! (HTML_CLRLINE & htmltags[tag].flags)) {
                        /* Manage keeps! */
                        if ( ! (HTML_KEEP & h->flags)) {
                                if (HTML_PREKEEP & h->flags)
                                        h->flags |= HTML_KEEP;
                                putchar(' ');
                        } else
                                printf("&#160;");
                }

        if ( ! (h->flags & HTML_NONOSPACE))
                h->flags &= ~HTML_NOSPACE;
        else
                h->flags |= HTML_NOSPACE;

        /* Print out the tag name and attributes. */

        printf("<%s", htmltags[tag].name);
        for (i = 0; i < sz; i++)
                print_attr(h, htmlattrs[p[i].key], p[i].val);

        /* Add non-overridable attributes. */

        if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) {
                print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml");
                print_attr(h, "xml:lang", "en");
                print_attr(h, "lang", "en");
        }

        /* Accommodate XML "well-formed" singleton escaping. */

        if (HTML_AUTOCLOSE & htmltags[tag].flags)
                switch (h->type) {
                case (HTML_XHTML_1_0_STRICT):
                        putchar('/');
                        break;
                default:
                        break;
                }

        putchar('>');

        /* No space is wanted directly after an opening tag. */
        h->flags |= HTML_NOSPACE;

        if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags)
                putchar('\n');

        return(t);
}
448
449
450 static void
451 print_ctag(struct html *h, enum htmltag tag)
452 {
453         
454         printf("</%s>", htmltags[tag].name);
455         if (HTML_CLRLINE & htmltags[tag].flags) {
456                 h->flags |= HTML_NOSPACE;
457                 putchar('\n');
458         } 
459 }
460
461
/*
 * Print the document prologue: print_xmltype() first, then
 * print_doctype(), both driven by the output type in h.
 */
void
print_gen_decls(struct html *h)
{

        /* The order matters: the XML declaration precedes the DOCTYPE. */
        print_xmltype(h);
        print_doctype(h);
}
469
470
471 static void
472 print_xmltype(struct html *h)
473 {
474
475         if (HTML_XHTML_1_0_STRICT == h->type)
476                 puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
477 }
478
479
480 static void
481 print_doctype(struct html *h)
482 {
483         const char      *doctype;
484         const char      *dtd;
485         const char      *name;
486
487         switch (h->type) {
488         case (HTML_HTML_4_01_STRICT):
489                 name = "HTML";
490                 doctype = "-//W3C//DTD HTML 4.01//EN";
491                 dtd = "http://www.w3.org/TR/html4/strict.dtd";
492                 break;
493         default:
494                 name = "html";
495                 doctype = "-//W3C//DTD XHTML 1.0 Strict//EN";
496                 dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
497                 break;
498         }
499
500         printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n", 
501                         name, doctype, dtd);
502 }
503
504
505 void
506 print_text(struct html *h, const char *word)
507 {
508
509         if (word[0] && '\0' == word[1])
510                 switch (word[0]) {
511                 case('.'):
512                         /* FALLTHROUGH */
513                 case(','):
514                         /* FALLTHROUGH */
515                 case(';'):
516                         /* FALLTHROUGH */
517                 case(':'):
518                         /* FALLTHROUGH */
519                 case('?'):
520                         /* FALLTHROUGH */
521                 case('!'):
522                         /* FALLTHROUGH */
523                 case(')'):
524                         /* FALLTHROUGH */
525                 case(']'):
526                         if ( ! (HTML_IGNDELIM & h->flags))
527                                 h->flags |= HTML_NOSPACE;
528                         break;
529                 default:
530                         break;
531                 }
532
533         if ( ! (HTML_NOSPACE & h->flags)) {
534                 /* Manage keeps! */
535                 if ( ! (HTML_KEEP & h->flags)) {
536                         if (HTML_PREKEEP & h->flags)
537                                 h->flags |= HTML_KEEP;
538                         putchar(' ');
539                 } else
540                         printf("&#160;");
541         }
542
543         assert(NULL == h->metaf);
544         if (HTMLFONT_NONE != h->metac)
545                 h->metaf = HTMLFONT_BOLD == h->metac ?
546                         print_otag(h, TAG_B, 0, NULL) :
547                         print_otag(h, TAG_I, 0, NULL);
548
549         assert(word);
550         if ( ! print_encode(h, word, 0))
551                 if ( ! (h->flags & HTML_NONOSPACE))
552                         h->flags &= ~HTML_NOSPACE;
553
554         if (h->metaf) {
555                 print_tagq(h, h->metaf);
556                 h->metaf = NULL;
557         }
558
559         h->flags &= ~HTML_IGNDELIM;
560
561         /* 
562          * Note that we don't process the pipe: the parser sees it as
563          * punctuation, but we don't in terms of typography.
564          */
565         if (word[0] && '\0' == word[1])
566                 switch (word[0]) {
567                 case('('):
568                         /* FALLTHROUGH */
569                 case('['):
570                         h->flags |= HTML_NOSPACE;
571                         break;
572                 default:
573                         break;
574                 }
575 }
576
577
578 void
579 print_tagq(struct html *h, const struct tag *until)
580 {
581         struct tag      *tag;
582
583         while ((tag = h->tags.head) != NULL) {
584                 if (tag == h->metaf)
585                         h->metaf = NULL;
586                 print_ctag(h, tag->tag);
587                 h->tags.head = tag->next;
588                 free(tag);
589                 if (until && tag == until)
590                         return;
591         }
592 }
593
594
595 void
596 print_stagq(struct html *h, const struct tag *suntil)
597 {
598         struct tag      *tag;
599
600         while ((tag = h->tags.head) != NULL) {
601                 if (suntil && tag == suntil)
602                         return;
603                 if (tag == h->metaf)
604                         h->metaf = NULL;
605                 print_ctag(h, tag->tag);
606                 h->tags.head = tag->next;
607                 free(tag);
608         }
609 }
610
611
612 void
613 bufinit(struct html *h)
614 {
615
616         h->buf[0] = '\0';
617         h->buflen = 0;
618 }
619
620
/*
 * Append one CSS declaration of the form "key:val;" to the scratch
 * buffer in h.
 */
void
bufcat_style(struct html *h, const char *key, const char *val)
{

        bufcat(h, key);
        bufcat(h, ":");
        bufcat(h, val);
        bufcat(h, ";");
}
630
631
/*
 * Append the NUL-terminated string p to the scratch buffer in h.
 * Truncation is left to bufncat().
 */
void
bufcat(struct html *h, const char *p)
{
        size_t           len;

        len = strlen(p);
        bufncat(h, p, len);
}
638
639
640 void
641 buffmt(struct html *h, const char *fmt, ...)
642 {
643         va_list          ap;
644
645         va_start(ap, fmt);
646         (void)vsnprintf(h->buf + (int)h->buflen, 
647                         BUFSIZ - h->buflen - 1, fmt, ap);
648         va_end(ap);
649         h->buflen = strlen(h->buf);
650 }
651
652
653 void
654 bufncat(struct html *h, const char *p, size_t sz)
655 {
656
657         if (h->buflen + sz > BUFSIZ - 1)
658                 sz = BUFSIZ - 1 - h->buflen;
659
660         (void)strncat(h->buf, p, sz);
661         h->buflen += sz;
662 }
663
664
665 void
666 buffmt_includes(struct html *h, const char *name)
667 {
668         const char      *p, *pp;
669
670         pp = h->base_includes;
671         
672         while (NULL != (p = strchr(pp, '%'))) {
673                 bufncat(h, pp, (size_t)(p - pp));
674                 switch (*(p + 1)) {
675                 case('I'):
676                         bufcat(h, name);
677                         break;
678                 default:
679                         bufncat(h, p, 2);
680                         break;
681                 }
682                 pp = p + 2;
683         }
684         if (pp)
685                 bufcat(h, pp);
686 }
687
688
689 void
690 buffmt_man(struct html *h, 
691                 const char *name, const char *sec)
692 {
693         const char      *p, *pp;
694
695         pp = h->base_man;
696         
697         /* LINTED */
698         while (NULL != (p = strchr(pp, '%'))) {
699                 bufncat(h, pp, (size_t)(p - pp));
700                 switch (*(p + 1)) {
701                 case('S'):
702                         bufcat(h, sec ? sec : "1");
703                         break;
704                 case('N'):
705                         buffmt(h, name);
706                         break;
707                 default:
708                         bufncat(h, p, 2);
709                         break;
710                 }
711                 pp = p + 2;
712         }
713         if (pp)
714                 bufcat(h, pp);
715 }
716
717
718 void
719 bufcat_su(struct html *h, const char *p, const struct roffsu *su)
720 {
721         double           v;
722         const char      *u;
723
724         v = su->scale;
725
726         switch (su->unit) {
727         case (SCALE_CM):
728                 u = "cm";
729                 break;
730         case (SCALE_IN):
731                 u = "in";
732                 break;
733         case (SCALE_PC):
734                 u = "pc";
735                 break;
736         case (SCALE_PT):
737                 u = "pt";
738                 break;
739         case (SCALE_EM):
740                 u = "em";
741                 break;
742         case (SCALE_MM):
743                 if (0 == (v /= 100))
744                         v = 1;
745                 u = "em";
746                 break;
747         case (SCALE_EN):
748                 u = "ex";
749                 break;
750         case (SCALE_BU):
751                 u = "ex";
752                 break;
753         case (SCALE_VS):
754                 u = "em";
755                 break;
756         default:
757                 u = "ex";
758                 break;
759         }
760
761         /* 
762          * XXX: the CSS spec isn't clear as to which types accept
763          * integer or real numbers, so we just make them all decimals.
764          */
765         buffmt(h, "%s: %.2f%s;", p, v, u);
766 }
767
768
/*
 * Append src to the identifier being built in dst, writing every
 * byte of src as two lower-case hexadecimal digits.
 *
 * dst must hold a NUL-terminated string (possibly empty, possibly
 * starting with '#') in a buffer of sz bytes; sz must exceed 2.
 * Output stops once fewer than two bytes of room remain, and the
 * result is always NUL-terminated.
 */
void
html_idcat(char *dst, const char *src, int sz)
{
        int              ssz;

        assert(sz > 2);

        /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */

        /*
         * The appended text consists of hex digits only, so an
         * identifier that is still empty (apart from an optional
         * leading '#') is given an 'x' to keep it from starting with
         * a digit.
         */

        if ('#' == *dst) {
                dst++;
                sz--;
        }
        if ('\0' == *dst) {
                *dst++ = 'x';
                *dst = '\0';
                sz--;
        }

        for ( ; *dst != '\0' && sz; dst++, sz--)
                /* Jump to end. */ ;

        for ( ; *src != '\0' && sz > 1; src++) {
                /*
                 * Go through unsigned char: a plain char may be
                 * signed, and a negative value would be printed as
                 * eight hex digits (ffffffXX) instead of two.
                 */
                ssz = snprintf(dst, (size_t)sz, "%.2x",
                                (unsigned char)*src);
                /* Stop on an output error or once output was cut short. */
                if (ssz < 0 || ssz >= sz)
                        break;
                sz -= ssz;
                dst += ssz;
        }
}