gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* $Id: preconv.c,v 1.17 2018/12/13 11:55:47 schwarze Exp $ */
	2	/*
	3	* Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
	4	* Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
	5	*
	6	* Permission to use, copy, modify, and distribute this software for any
	7	* purpose with or without fee is hereby granted, provided that the above
	8	* copyright notice and this permission notice appear in all copies.
	9	*
	10	* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
	11	* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
	12	* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
	13	* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	14	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
	15	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
	16	* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
	17	*/
	18	#include "config.h"
	19
	20	#include <sys/types.h>
	21
	22	#include <assert.h>
	23	#include <stdio.h>
	24	#include <string.h>
	25
	26	#include "mandoc.h"
	27	#include "roff.h"
	28	#include "mandoc_parse.h"
	29	#include "libmandoc.h"
	30
	31	int
	32	preconv_encode(const struct buf ib, size_t ii, struct buf ob, size_t oi,
	33	int *filenc)
	34	{
	35	const unsigned char *cu;
	36	int nby;
	37	unsigned int accum;
	38
	39	cu = (const unsigned char )ib->buf + ii;
	40	assert(*cu & 0x80);
	41
	42	if ( ! (*filenc & MPARSE_UTF8))
	43	goto latin;
	44
	45	nby = 1;
	46	while (nby < 5 && *cu & (1 << (7 - nby)))
	47	nby++;
	48
	49	switch (nby) {
	50	case 2:
	51	accum = *cu & 0x1f;
	52	if (accum < 0x02) /* Obfuscated ASCII. */
	53	goto latin;
	54	break;
	55	case 3:
	56	accum = *cu & 0x0f;
	57	break;
	58	case 4:
	59	accum = *cu & 0x07;
	60	if (accum > 0x04) /* Beyond Unicode. */
	61	goto latin;
	62	break;
	63	default: /* Bad sequence header. */
	64	goto latin;
	65	}
	66
	67	cu++;
	68	switch (nby) {
	69	case 3:
	70	if ((accum == 0x00 && ! (cu & 0x20)) \|\| / Use 2-byte. */
	71	(accum == 0x0d && cu & 0x20)) / Surrogates. */
	72	goto latin;
	73	break;
	74	case 4:
	75	if ((accum == 0x00 && ! (cu & 0x30)) \|\| / Use 3-byte. */
	76	(accum == 0x04 && cu & 0x30)) / Beyond Unicode. */
	77	goto latin;
	78	break;
	79	default:
	80	break;
	81	}
	82
	83	while (--nby) {
	84	if ((cu & 0xc0) != 0x80) / Invalid continuation. */
	85	goto latin;
	86	accum <<= 6;
	87	accum += *cu & 0x3f;
	88	cu++;
	89	}
	90
	91	assert(accum > 0x7f);
	92	assert(accum < 0x110000);
	93	assert(accum < 0xd800 \|\| accum > 0xdfff);
	94
	95	oi += snprintf(ob->buf + oi, 11, "\\[u%.4X]", accum);
	96	ii = (const char )cu - ib->buf;
	97	*filenc &= ~MPARSE_LATIN1;
	98	return 1;
	99
	100	latin:
	101	if ( ! (*filenc & MPARSE_LATIN1))
	102	return 0;
	103
	104	oi += snprintf(ob->buf + oi, 11,
	105	"\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]);
	106
	107	*filenc &= ~MPARSE_UTF8;
	108	return 1;
	109	}
	110
	111	int
	112	preconv_cue(const struct buf *b, size_t offset)
	113	{
	114	const char ln, eoln, *eoph;
	115	size_t sz, phsz;
	116
	117	ln = b->buf + offset;
	118	sz = b->sz - offset;
	119
	120	/* Look for the end-of-line. */
	121
	122	if (NULL == (eoln = memchr(ln, '\n', sz)))
	123	eoln = ln + sz;
	124
	125	/* Check if we have the correct header/trailer. */
	126
	127	if ((sz = (size_t)(eoln - ln)) < 10 \|\|
	128	memcmp(ln, ".\\\" --", 7) \|\| memcmp(eoln - 3, "--", 3))
	129	return MPARSE_UTF8 \| MPARSE_LATIN1;
	130
	131	/* Move after the header and adjust for the trailer. */
	132
	133	ln += 7;
	134	sz -= 10;
	135
	136	while (sz > 0) {
	137	while (sz > 0 && ' ' == *ln) {
	138	ln++;
	139	sz--;
	140	}
	141	if (0 == sz)
	142	break;
	143
	144	/* Find the end-of-phrase marker (or eoln). */
	145
	146	if (NULL == (eoph = memchr(ln, ';', sz)))
	147	eoph = eoln - 3;
	148	else
	149	eoph++;
	150
	151	/* Only account for the "coding" phrase. */
	152
	153	if ((phsz = eoph - ln) < 7 \|\|
	154	strncasecmp(ln, "coding:", 7)) {
	155	sz -= phsz;
	156	ln += phsz;
	157	continue;
	158	}
	159
	160	sz -= 7;
	161	ln += 7;
	162
	163	while (sz > 0 && ' ' == *ln) {
	164	ln++;
	165	sz--;
	166	}
	167	if (0 == sz)
	168	return 0;
	169
	170	/* Check us against known encodings. */
	171
	172	if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
	173	return MPARSE_UTF8;
	174	if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
	175	return MPARSE_LATIN1;
	176	return 0;
	177	}
	178	return MPARSE_UTF8 \| MPARSE_LATIN1;
	179	}