Initial import of binutils 2.22 on the new vendor branch
[dragonfly.git] / contrib / groff / src / preproc / preconv / preconv.cpp
1 // -*- C++ -*-
2 /* Copyright (C) 2005, 2006, 2008, 2009
3    Free Software Foundation, Inc.
4      Written by Werner Lemberg (wl@gnu.org)
5
6 This file is part of groff.
7
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20
21 #include "lib.h"
22
23 #include <assert.h>
24 #include <stdlib.h>
25 #include <errno.h>
26 #include "errarg.h"
27 #include "error.h"
28 #include "localcharset.h"
29 #include "nonposix.h"
30 #include "stringclass.h"
31
32 #include <locale.h>
33
34 #if HAVE_ICONV
35 # include <iconv.h>
36 # ifdef WORDS_BIGENDIAN
37 #  define UNICODE "UTF-32BE"
38 # else
39 #  define UNICODE "UTF-32LE"
40 # endif
41 #endif
42
43 #define MAX_VAR_LEN 100
44
45 extern "C" const char *Version_string;
46
47 char default_encoding[MAX_VAR_LEN];
48 char user_encoding[MAX_VAR_LEN];
49 char encoding_string[MAX_VAR_LEN];
50 int debug_flag = 0;
51 int raw_flag = 0;
52
// One row of an encoding-name translation table: `from' is the name to
// look for, `to' is the name it is replaced with.  A row whose `from' is
// NULL terminates a table.
struct conversion {
  const char *from;     // name to match
  const char *to;       // replacement name
};
57
58 // The official list of MIME tags can be found at
59 //
60 //   http://www.iana.org/assignments/character-sets
61 //
62 // For encodings which don't have a MIME tag we use GNU iconv's encoding
63 // names (which also work with the portable GNU libiconv package).  They
64 // are marked with `*'.
65 //
66 // Encodings specific to XEmacs and Emacs are marked as such; no mark means
67 // that they are used by both Emacs and XEmacs.
68 //
69 // Encodings marked with `--' are special to Emacs, XEmacs, or other
70 // applications and shouldn't be used for data exchange.
71 //
72 // `Not covered' means that the encoding can be handled neither by GNU iconv
73 // nor by libiconv, or just one of them has support for it.
74 //
75 // A special case is the VIQR encoding: despite having a MIME tag, it is
76 // missing in both libiconv 1.10 and iconv (coming with GNU libc 2.3.6).
77 //
78 // Finally, we add all aliases of GNU iconv for `ascii', `latin1', and
79 // `utf8' to catch those encoding names before iconv is called.
80 //
81 // Note that most entries are commented out -- only a small, (rather)
82 // reliable and stable subset of encodings is recognized (for coding tags)
83 // which are still in greater use today (January 2006).  Most notably, all
84 // Windows-specific encodings are not selected because they lack stability:
85 // Microsoft has changed the mappings instead of creating new versions.
86 //
87 // Please contact the groff list if you find the selection inadequate.
88
89 static const conversion
90 emacs_to_mime[] = {
91   {"ascii",                             "US-ASCII"},    // Emacs
92   {"big5",                              "Big5"},
93   {"chinese-big5",                      "Big5"},        // Emacs
94   {"chinese-euc",                       "GB2312"},      // XEmacs
95   {"chinese-iso-8bit",                  "GB2312"},      // Emacs
96   {"cn-big5",                           "Big5"},
97   {"cn-gb",                             "GB2312"},      // Emacs
98   {"cn-gb-2312",                        "GB2312"},
99   {"cp878",                             "KOI8-R"},      // Emacs
100   {"cp1047",                            "CP1047"},      // EBCDIC
101   {"csascii",                           "US-ASCII"},    // alias
102   {"csisolatin1",                       "ISO-8859-1"},  // alias
103   {"cyrillic-iso-8bit",                 "ISO-8859-5"},  // Emacs
104   {"cyrillic-koi8",                     "KOI8-R"},      // not KOI8!, Emacs
105   {"euc-china",                         "GB2312"},      // Emacs
106   {"euc-cn",                            "GB2312"},      // Emacs
107   {"euc-japan",                         "EUC-JP"},
108   {"euc-japan-1990",                    "EUC-JP"},      // Emacs
109   {"euc-jp",                            "EUC-JP"},
110   {"euc-korea",                         "EUC-KR"},
111   {"euc-kr",                            "EUC-KR"},
112   {"gb2312",                            "GB2312"},
113   {"greek-iso-8bit",                    "ISO-8859-7"},
114   {"iso-10646/utf8",                    "UTF-8"},       // alias
115   {"iso-10646/utf-8",                   "UTF-8"},       // alias
116   {"iso-8859-1",                        "ISO-8859-1"},
117   {"iso-8859-13",                       "ISO-8859-13"}, // Emacs
118   {"iso-8859-15",                       "ISO-8859-15"},
119   {"iso-8859-2",                        "ISO-8859-2"},
120   {"iso-8859-5",                        "ISO-8859-5"},
121   {"iso-8859-7",                        "ISO-8859-7"},
122   {"iso-8859-9",                        "ISO-8859-9"},
123   {"iso-latin-1",                       "ISO-8859-1"},
124   {"iso-latin-2",                       "ISO-8859-2"},  // Emacs
125   {"iso-latin-5",                       "ISO-8859-9"},  // Emacs
126   {"iso-latin-7",                       "ISO-8859-13"}, // Emacs
127   {"iso-latin-9",                       "ISO-8859-15"}, // Emacs
128   {"japanese-iso-8bit",                 "EUC-JP"},      // Emacs
129   {"japanese-euc",                      "EUC-JP"},      // XEmacs
130   {"jis8",                              "EUC-JP"},      // XEmacs
131   {"koi8",                              "KOI8-R"},      // not KOI8!, Emacs
132   {"koi8-r",                            "KOI8-R"},
133   {"korean-euc",                        "EUC-KR"},      // XEmacs
134   {"korean-iso-8bit",                   "EUC-KR"},      // Emacs
135   {"latin1",                            "ISO-8859-1"},  // alias
136   {"latin-0",                           "ISO-8859-15"}, // Emacs
137   {"latin-1",                           "ISO-8859-1"},  // Emacs
138   {"latin-2",                           "ISO-8859-2"},  // Emacs
139   {"latin-5",                           "ISO-8859-9"},  // Emacs
140   {"latin-7",                           "ISO-8859-13"}, // Emacs
141   {"latin-9",                           "ISO-8859-15"}, // Emacs
142   {"mule-utf-16",                       "UTF-16"},      // Emacs
143   {"mule-utf-16be",                     "UTF-16BE"},    // Emacs
144   {"mule-utf-16-be",                    "UTF-16BE"},    // Emacs
145   {"mule-utf-16be-with-signature",      "UTF-16"},      // Emacs, not UTF-16BE
146   {"mule-utf-16le",                     "UTF-16LE"},    // Emacs
147   {"mule-utf-16-le",                    "UTF-16LE"},    // Emacs
148   {"mule-utf-16le-with-signature",      "UTF-16"},      // Emacs, not UTF-16LE
149   {"mule-utf-8",                        "UTF-8"},       // Emacs
150   {"us-ascii",                          "US-ASCII"},    // Emacs
151   {"utf8",                              "UTF-8"},       // alias
152   {"utf-16",                            "UTF-16"},      // Emacs
153   {"utf-16be",                          "UTF-16BE"},    // Emacs
154   {"utf-16-be",                         "UTF-16BE"},    // Emacs
155   {"utf-16be-with-signature",           "UTF-16"},      // Emacs, not UTF-16BE
156   {"utf-16-be-with-signature",          "UTF-16"},      // Emacs, not UTF-16BE
157   {"utf-16le",                          "UTF-16LE"},    // Emacs
158   {"utf-16-le",                         "UTF-16LE"},    // Emacs
159   {"utf-16le-with-signature",           "UTF-16"},      // Emacs, not UTF-16LE
160   {"utf-16-le-with-signature",          "UTF-16"},      // Emacs, not UTF-16LE
161   {"utf-8",                             "UTF-8"},       // Emacs
162
163 //  {"alternativnyj",                   ""},            // ?
164 //  {"arabic-iso-8bit",                 "ISO-8859-6"},  // Emacs
165 //  {"binary",                          ""},            // --
166 //  {"chinese-hz",                      "HZ-GB-2312"},  // Emacs
167 //  {"chinese-iso-7bit",                "ISO-2022-CN"}, // Emacs
168 //  {"chinese-iso-8bit-with-esc",       ""},            // --
169 //  {"compound-text",                   ""},            // --
170 //  {"compound-text-with-extension",    ""},            // --
171 //  {"cp1125",                          "cp1125"},      // *
172 //  {"cp1250",                          "windows-1250"},// Emacs
173 //  {"cp1251",                          "windows-1251"},// Emacs
174 //  {"cp1252",                          "windows-1252"},// Emacs
175 //  {"cp1253",                          "windows-1253"},// Emacs
176 //  {"cp1254",                          "windows-1254"},// Emacs
177 //  {"cp1255",                          "windows-1255"},// Emacs
178 //  {"cp1256",                          "windows-1256"},// Emacs
179 //  {"cp1257",                          "windows-1257"},// Emacs
180 //  {"cp1258",                          "windows-1258"},// Emacs
181 //  {"cp437",                           "cp437"},       // Emacs
182 //  {"cp720",                           ""},            // not covered
183 //  {"cp737",                           "cp737"},       // *, Emacs
184 //  {"cp775",                           "cp775"},       // Emacs
185 //  {"cp850",                           "cp850"},       // Emacs
186 //  {"cp851",                           "cp851"},       // Emacs
187 //  {"cp852",                           "cp852"},       // Emacs
188 //  {"cp855",                           "cp855"},       // Emacs
189 //  {"cp857",                           "cp857"},       // Emacs
190 //  {"cp860",                           "cp860"},       // Emacs
191 //  {"cp861",                           "cp861"},       // Emacs
192 //  {"cp862",                           "cp862"},       // Emacs
193 //  {"cp863",                           "cp863"},       // Emacs
194 //  {"cp864",                           "cp864"},       // Emacs
195 //  {"cp865",                           "cp865"},       // Emacs
196 //  {"cp866",                           "cp866"},       // Emacs
197 //  {"cp866u",                          "cp1125"},      // *, Emacs
198 //  {"cp869",                           "cp869"},       // Emacs
199 //  {"cp874",                           "cp874"},       // *, Emacs
200 //  {"cp932",                           "cp932"},       // *, Emacs
201 //  {"cp936",                           "cp936"},       // Emacs
202 //  {"cp949",                           "cp949"},       // *, Emacs
203 //  {"cp950",                           "cp950"},       // *, Emacs
204 //  {"ctext",                           ""},            // --
205 //  {"ctext-no-compositions",           ""},            // --
206 //  {"ctext-with-extensions",           ""},            // --
207 //  {"cyrillic-alternativnyj",          ""},            // ?, Emacs
208 //  {"cyrillic-iso-8bit-with-esc",      ""},            // --
209 //  {"cyrillic-koi8-t",                 "KOI8-T"},      // *, Emacs
210 //  {"devanagari",                      ""},            // not covered
211 //  {"dos",                             ""},            // --
212 //  {"emacs-mule",                      ""},            // --
213 //  {"euc-jisx0213",                    "EUC-JISX0213"},// *, XEmacs?
214 //  {"euc-jisx0213-with-esc",           ""},            // XEmacs?
215 //  {"euc-taiwan",                      "EUC-TW"},      // *, Emacs
216 //  {"euc-tw",                          "EUC-TW"},      // *, Emacs
217 //  {"georgian-ps",                     "GEORGIAN-PS"}, // *, Emacs
218 //  {"greek-iso-8bit-with-esc",         ""},            // --
219 //  {"hebrew-iso-8bit",                 "ISO-8859-8"},  // Emacs
220 //  {"hebrew-iso-8bit-with-esc",        ""},            // --
221 //  {"hz",                              "HZ-GB-2312"},
222 //  {"hz-gb-2312",                      "HZ-GB-2312"},
223 //  {"in-is13194",                      ""},            // not covered
224 //  {"in-is13194-devanagari",           ""},            // not covered
225 //  {"in-is13194-with-esc",             ""},            // --
226 //  {"iso-2022-7",                      ""},            // XEmacs?
227 //  {"iso-2022-7bit",                   ""},            // --
228 //  {"iso-2022-7bit-lock",              ""},            // --
229 //  {"iso-2022-7bit-lock-ss2",          ""},            // --
230 //  {"iso-2022-7bit-ss2",               ""},            // --
231 //  {"iso-2022-8",                      ""},            // XEmacs?
232 //  {"iso-2022-8bit",                   ""},            // XEmacs?
233 //  {"iso-2022-8bit-lock",              ""},            // XEmacs?
234 //  {"iso-2022-8bit-lock-ss2",          ""},            // XEmacs?
235 //  {"iso-2022-8bit-ss2",               ""},            // --
236 //  {"iso-2022-cjk",                    ""},            // --
237 //  {"iso-2022-cn",                     "ISO-2022-CN"}, // Emacs
238 //  {"iso-2022-cn-ext",                 "ISO-2022-CN-EXT"},// Emacs
239 //  {"iso-2022-int-1",                  ""},            // --
240 //  {"iso-2022-jp",                     "ISO-2022-JP"},
241 //  {"iso-2022-jp-1978-irv",            "ISO-2022-JP"},
242 //  {"iso-2022-jp-2",                   "ISO-2022-JP-2"},
243 //  {"iso-2022-jp-3",                   "ISO-2022-JP-3"},// *, XEmacs?
244 //  {"iso-2022-jp-3-compatible",        ""},            // XEmacs?
245 //  {"iso-2022-jp-3-strict",            "ISO-2022-JP-3"},// *, XEmacs?
246 //  {"iso-2022-kr",                     "ISO-2022-KR"},
247 //  {"iso-2022-lock",                   ""},            // XEmacs?
248 //  {"iso-8859-10",                     "ISO-8859-10"}, // Emacs
249 //  {"iso-8859-11",                     "ISO-8859-11"}, // *, Emacs
250 //  {"iso-8859-14",                     "ISO-8859-14"}, // Emacs
251 //  {"iso-8859-16",                     "ISO-8859-16"},
252 //  {"iso-8859-3",                      "ISO-8859-3"},
253 //  {"iso-8859-4",                      "ISO-8859-4"},
254 //  {"iso-8859-6",                      "ISO-8859-6"},
255 //  {"iso-8859-8",                      "ISO-8859-8"},
256 //  {"iso-8859-8-e",                    "ISO-8859-8"},
257 //  {"iso-8859-8-i",                    "ISO-8859-8"},  // Emacs
258 //  {"iso-latin-10",                    "ISO-8859-16"}, // Emacs
259 //  {"iso-latin-1-with-esc",            ""},            // --
260 //  {"iso-latin-2-with-esc",            ""},            // --
261 //  {"iso-latin-3",                     "ISO-8859-3"},  // Emacs
262 //  {"iso-latin-3-with-esc",            ""},            // --
263 //  {"iso-latin-4",                     "ISO-8859-4"},  // Emacs
264 //  {"iso-latin-4-with-esc",            ""},            // --
265 //  {"iso-latin-5-with-esc",            ""},            // --
266 //  {"iso-latin-6",                     "ISO-8859-10"}, // Emacs
267 //  {"iso-latin-8",                     "ISO-8859-14"}, // Emacs
268 //  {"iso-safe",                                ""},            // --
269 //  {"japanese-iso-7bit-1978-irv",      "ISO-2022-JP"}, // Emacs
270 //  {"japanese-iso-8bit-with-esc",      ""},            // --
271 //  {"japanese-shift-jis",              "Shift_JIS"},   // Emacs
272 //  {"japanese-shift-jisx0213",         ""},            // XEmacs?
273 //  {"jis7",                            "ISO-2022-JP"}, // Xemacs
274 //  {"junet",                           "ISO-2022-JP"},
275 //  {"koi8-t",                          "KOI8-T"},      // *, Emacs
276 //  {"koi8-u",                          "KOI8-U"},      // Emacs
277 //  {"korean-iso-7bit-lock",            "ISO-2022-KR"},
278 //  {"korean-iso-8bit-with-esc",        ""},            // --
279 //  {"lao",                             ""},            // not covered
280 //  {"lao-with-esc",                    ""},            // --
281 //  {"latin-10",                        "ISO-8859-16"}, // Emacs
282 //  {"latin-3",                         "ISO-8859-3"},  // Emacs
283 //  {"latin-4",                         "ISO-8859-4"},  // Emacs
284 //  {"latin-6",                         "ISO-8859-10"}, // Emacs
285 //  {"latin-8",                         "ISO-8859-14"}, // Emacs
286 //  {"mac",                             ""},            // --
287 //  {"mac-roman",                       "MACINTOSH"},   // Emacs
288 //  {"mik",                             ""},            // not covered
289 //  {"next",                            "NEXTSTEP"},    // *, Emacs
290 //  {"no-conversion",                   ""},            // --
291 //  {"old-jis",                         "ISO-2022-JP"},
292 //  {"pt154",                           "PT154"},       // Emacs
293 //  {"raw-text",                        ""},            // --
294 //  {"ruscii",                          "cp1125"},      // *, Emacs
295 //  {"shift-jis",                       "Shift_JIS"},   // XEmacs
296 //  {"shift_jis",                       "Shift_JIS"},
297 //  {"shift_jisx0213",                  "Shift_JISX0213"},// *, XEmacs?
298 //  {"sjis",                            "Shift_JIS"},   // Emacs
299 //  {"tcvn",                            "TCVN"},        // *, Emacs
300 //  {"tcvn-5712",                       "TCVN"},        // *, Emacs
301 //  {"thai-tis620",                     "TIS-620"},
302 //  {"thai-tis620-with-esc",            ""},            // --
303 //  {"th-tis620",                       "TIS-620"},
304 //  {"tibetan",                         ""},            // not covered
305 //  {"tibetan-iso-8bit",                ""},            // not covered
306 //  {"tibetan-iso-8bit-with-esc",       ""},            // --
307 //  {"tis-620",                         "TIS-620"},
308 //  {"tis620",                          "TIS-620"},
309 //  {"undecided",                       ""},            // --
310 //  {"unix",                            ""},            // --
311 //  {"utf-7",                           "UTF-7"},       // Emacs
312 //  {"utf-7-safe",                      ""},            // XEmacs?
313 //  {"utf-8-ws",                        "UTF-8"},       // XEmacs?
314 //  {"vietnamese-tcvn",                 "TCVN"},        // *, Emacs
315 //  {"vietnamese-viqr",                 "VIQR"},        // not covered
316 //  {"vietnamese-viscii",               "VISCII"},
317 //  {"vietnamese-vscii",                ""},            // not covered
318 //  {"viqr",                            "VIQR"},        // not covered
319 //  {"viscii",                          "VISCII"},
320 //  {"vscii",                           ""},            // not covered
321 //  {"windows-037",                     ""},            // not covered
322 //  {"windows-10000",                   ""},            // not covered
323 //  {"windows-10001",                   ""},            // not covered
324 //  {"windows-10006",                   ""},            // not covered
325 //  {"windows-10007",                   ""},            // not covered
326 //  {"windows-10029",                   ""},            // not covered
327 //  {"windows-10079",                   ""},            // not covered
328 //  {"windows-10081",                   ""},            // not covered
329 //  {"windows-1026",                    ""},            // not covered
330 //  {"windows-1200",                    ""},            // not covered
331 //  {"windows-1250",                    "windows-1250"},
332 //  {"windows-1251",                    "windows-1251"},
333 //  {"windows-1252",                    "windows-1252"},
334 //  {"windows-1253",                    "windows-1253"},
335 //  {"windows-1254",                    "windows-1254"},
336 //  {"windows-1255",                    "windows-1255"},
337 //  {"windows-1256",                    "windows-1256"},
338 //  {"windows-1257",                    "windows-1257"},
339 //  {"windows-1258",                    "windows-1258"},
340 //  {"windows-1361",                    "cp1361"},      // *, XEmacs
341 //  {"windows-437",                     "cp437"},       // XEmacs
342 //  {"windows-500",                     ""},            // not covered
343 //  {"windows-708",                     ""},            // not covered
344 //  {"windows-709",                     ""},            // not covered
345 //  {"windows-710",                     ""},            // not covered
346 //  {"windows-720",                     ""},            // not covered
347 //  {"windows-737",                     "cp737"},       // *, XEmacs
348 //  {"windows-775",                     "cp775"},       // XEmacs
349 //  {"windows-850",                     "cp850"},       // XEmacs
350 //  {"windows-852",                     "cp852"},       // XEmacs
351 //  {"windows-855",                     "cp855"},       // XEmacs
352 //  {"windows-857",                     "cp857"},       // XEmacs
353 //  {"windows-860",                     "cp860"},       // XEmacs
354 //  {"windows-861",                     "cp861"},       // XEmacs
355 //  {"windows-862",                     "cp862"},       // XEmacs
356 //  {"windows-863",                     "cp863"},       // XEmacs
357 //  {"windows-864",                     "cp864"},       // XEmacs
358 //  {"windows-865",                     "cp865"},       // XEmacs
359 //  {"windows-866",                     "cp866"},       // XEmacs
360 //  {"windows-869",                     "cp869"},       // XEmacs
361 //  {"windows-874",                     "cp874"},       // XEmacs
362 //  {"windows-875",                     ""},            // not covered
363 //  {"windows-932",                     "cp932"},       // *, XEmacs
364 //  {"windows-936",                     "cp936"},       // XEmacs
365 //  {"windows-949",                     "cp949"},       // *, XEmacs
366 //  {"windows-950",                     "cp950"},       // *, XEmacs
367 //  {"x-ctext",                         ""},            // --
368 //  {"x-ctext-with-extensions",         ""},            // --
369
370   {NULL,                                NULL},
371 };
372
373 // ---------------------------------------------------------
374 // Convert encoding name from emacs to mime.
375 // ---------------------------------------------------------
376 char *
377 emacs2mime(char *emacs_enc)
378 {
379   int emacs_enc_len = strlen(emacs_enc);
380   if (emacs_enc_len > 4
381       && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-dos"))
382     emacs_enc[emacs_enc_len - 4] = 0;
383   if (emacs_enc_len > 4
384       && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-mac"))
385     emacs_enc[emacs_enc_len - 4] = 0;
386   if (emacs_enc_len > 5
387       && !strcasecmp(emacs_enc + emacs_enc_len - 5, "-unix"))
388     emacs_enc[emacs_enc_len - 5] = 0;
389   for (const conversion *table = emacs_to_mime; table->from; table++)
390     if (!strcasecmp(emacs_enc, table->from))
391       return (char *)table->to;
392   return emacs_enc;
393 }
394
// ---------------------------------------------------------
// Write code point `u' to standard output: values below
// 0x80 as a single character, 0xAD (soft hyphen) as `\%',
// and everything else as a `\[uXXXX]' escape.
// ---------------------------------------------------------
inline void
unicode_entity(int u)
{
  if (u < 0x80) {
    putchar(u);
    return;
  }
  // The soft hyphen is an input character only, not a glyph, so it
  // gets its own two-character escape.
  if (u == 0xAD) {
    putchar('\\');
    putchar('%');
    return;
  }
  printf("\\[u%04X]", u);
}
414
415 // ---------------------------------------------------------
416 // Conversion functions.  All functions take `data', which
417 // normally holds the first two lines, and a file pointer.
418 // ---------------------------------------------------------
419
420 // Conversion from ISO-8859-1 (aka Latin-1) to Unicode.
421 void
422 conversion_latin1(FILE *fp, const string &data)
423 {
424   int len = data.length();
425   const unsigned char *ptr = (const unsigned char *)data.contents();
426   for (int i = 0; i < len; i++)
427     unicode_entity(ptr[i]);
428   int c = -1;
429   while ((c = getc(fp)) != EOF)
430     unicode_entity(c);
431 }
432
433 // A future version of groff shall support UTF-8 natively.
434 // In this case, the UTF-8 stuff here in this file will be
435 // moved to the troff program.
436
// Incremental UTF-8 decoder.  Bytes are fed in one at a time with add();
// each decoded character, and U+FFFD for each rejected sequence, is
// written through unicode_entity().
struct utf8 {
  FILE *fp;                     // stream passed to the constructor
  unsigned char s[6];           // bytes of the sequence being collected
  enum {
    FIRST = 0,
    SECOND,
    THIRD,
    FOURTH,
    FIFTH,
    SIXTH
  } byte;                       // slot in s[] the next byte goes into
  int expected_bytes;           // sequence length announced by the lead byte
  int invalid_warning;          // non-zero until invalid() has warned once
  int incomplete_warning;       // non-zero until incomplete() has warned once

  utf8(FILE *);
  ~utf8();                      // reports a sequence left unfinished
  void add(unsigned char);      // consume one input byte
  void invalid();               // emit U+FFFD for an invalid sequence
  void incomplete();            // emit U+FFFD for a truncated sequence
};
457
458 utf8::utf8(FILE *f) : fp(f), byte(FIRST), expected_bytes(1),
459                       invalid_warning(1), incomplete_warning(1)
460 {
461   // empty
462 }
463
464 utf8::~utf8()
465 {
466   if (byte != FIRST)
467     incomplete();
468 }
469
470 inline void
471 utf8::add(unsigned char c)
472 {
473   s[byte] = c;
474   if (byte == FIRST) {
475     if (c < 0x80)
476       unicode_entity(c);
477     else if (c < 0xC0)
478       invalid();
479     else if (c < 0xE0) {
480       expected_bytes = 2;
481       byte = SECOND;
482     }
483     else if (c < 0xF0) {
484       expected_bytes = 3;
485       byte = SECOND;
486     }
487     else if (c < 0xF8) {
488       expected_bytes = 4;
489       byte = SECOND;
490     }
491     else if (c < 0xFC) {
492       expected_bytes = 5;
493       byte = SECOND;
494     }
495     else if (c < 0xFE) {
496       expected_bytes = 6;
497       byte = SECOND;
498     }
499     else
500       invalid();
501     return;
502   }
503   if (c < 0x80 || c > 0xBF) {
504     incomplete();
505     add(c);
506     return;
507   }
508   switch (byte) {
509   case FIRST:
510     // can't happen
511     break;
512   case SECOND:
513     if (expected_bytes == 2) {
514       if (s[0] < 0xC2)
515         invalid();
516       else
517         unicode_entity(((s[0] & 0x1F) << 6)
518                        | (s[1] ^ 0x80));
519       byte = FIRST;
520     }
521     else
522       byte = THIRD;
523     break;
524   case THIRD:
525     if (expected_bytes == 3) {
526       if (!(s[0] >= 0xE1 || s[1] >= 0xA0))
527         invalid();
528       else
529         unicode_entity(((s[0] & 0x1F) << 12)
530                        | ((s[1] ^ 0x80) << 6)
531                        | (s[2] ^ 0x80));
532       byte = FIRST;
533     }
534     else
535       byte = FOURTH;
536     break;
537   case FOURTH:
538     // We reject everything greater than 0x10FFFF.
539     if (expected_bytes == 4) {
540       if (!((s[0] >= 0xF1 || s[1] >= 0x90)
541             && (s[0] < 0xF4 || (s[0] == 0xF4 && s[1] < 0x90))))
542         invalid();
543       else
544         unicode_entity(((s[0] & 0x07) << 18)
545                        | ((s[1] ^ 0x80) << 12)
546                        | ((s[2] ^ 0x80) << 6)
547                        | (s[3] ^ 0x80));
548       byte = FIRST;
549     }
550     else
551       byte = FIFTH;
552     break;
553   case FIFTH:
554     if (expected_bytes == 5) {
555       invalid();
556       byte = FIRST;
557     }
558     else
559       byte = SIXTH;
560     break;
561   case SIXTH:
562     invalid();
563     byte = FIRST;
564     break;
565   }
566 }
567
568 void
569 utf8::invalid()
570 {
571   if (debug_flag && invalid_warning) {
572     fprintf(stderr, "  invalid byte(s) found in input stream --\n"
573                     "  each such sequence replaced with 0xFFFD\n");
574     invalid_warning = 0;
575   }
576   unicode_entity(0xFFFD);
577   byte = FIRST;
578 }
579
580 void
581 utf8::incomplete()
582 {
583   if (debug_flag && incomplete_warning) {
584     fprintf(stderr, "  incomplete sequence(s) found in input stream --\n"
585                     "  each such sequence replaced with 0xFFFD\n");
586     incomplete_warning = 0;
587   }
588   unicode_entity(0xFFFD);
589   byte = FIRST;
590 }
591
592 // Conversion from UTF-8 to Unicode.
593 void
594 conversion_utf8(FILE *fp, const string &data)
595 {
596   utf8 u(fp);
597   int len = data.length();
598   const unsigned char *ptr = (const unsigned char *)data.contents();
599   for (int i = 0; i < len; i++)
600     u.add(ptr[i]);
601   int c = -1;
602   while ((c = getc(fp)) != EOF)
603     u.add(c);
604   return;
605 }
606
607 // Conversion from cp1047 (EBCDIC) to UTF-8.
608 void
609 conversion_cp1047(FILE *fp, const string &data)
610 {
611   static unsigned char cp1047[] = {
612     0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F,     // 0x00
613     0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
614     0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87,     // 0x10
615     0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
616     0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B,     // 0x20
617     0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
618     0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,     // 0x30
619     0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,
620     0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5,     // 0x40
621     0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
622     0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF,     // 0x50
623     0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
624     0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5,     // 0x60
625     0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
626     0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF,     // 0x70
627     0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
628     0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,     // 0x80
629     0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,
630     0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70,     // 0x90
631     0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,
632     0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,     // 0xA0
633     0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,
634     0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC,     // 0xB0
635     0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,
636     0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,     // 0xC0
637     0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,
638     0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,     // 0xD0
639     0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,
640     0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,     // 0xE0
641     0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,
642     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,     // 0xF0
643     0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F,
644   };
645   int len = data.length();
646   const unsigned char *ptr = (const unsigned char *)data.contents();
647   for (int i = 0; i < len; i++)
648     unicode_entity(cp1047[ptr[i]]);
649   int c = -1;
650   while ((c = getc(fp)) != EOF)
651     unicode_entity(cp1047[c]);
652 }
653
// Locale-sensitive conversion.
#if HAVE_ICONV

// Pass every complete code point stored in `outbuf' before `*outptr'
// to unicode_entity(), then make the whole buffer available again.
static void
iconv_emit_output(int *outbuf, size_t outbuf_size,
                  char **outptr, size_t *outbytes_left)
{
  char *limit = *outptr;
  for (int *ptr = outbuf; (char *)(ptr + 1) <= limit; ptr++)
    unicode_entity(*ptr);
  *outptr = (char *)outbuf;
  *outbytes_left = outbuf_size;
}

// Convert the `*inbytes_left' bytes starting at `*inptr'.
//
// On return, `*inbytes_left' is zero unless the input ends with an
// incomplete multibyte sequence; in that case `*inptr' points to the
// `*inbytes_left' unconverted bytes so that the caller can prepend
// them to the next chunk of input.
static void
iconv_convert_chunk(iconv_t handle,
                    char **inptr, size_t *inbytes_left,
                    int *outbuf, size_t outbuf_size,
                    char **outptr, size_t *outbytes_left)
{
  while (*inbytes_left > 0) {
    size_t status = iconv(handle,
                          (ICONV_CONST char **)inptr, inbytes_left,
                          outptr, outbytes_left);
    if (status != (size_t)-1)
      continue;
    if (errno == EILSEQ) {
      // Invalid byte sequence.  XXX
      // Skip a single byte and try again.
      (*inptr)++;
      (*inbytes_left)--;
    }
    else if (errno == E2BIG)
      // Output buffer is full.
      iconv_emit_output(outbuf, outbuf_size, outptr, outbytes_left);
    else if (errno == EINVAL)
      // Input ends with a partial input sequence.
      break;
    else
      // Any other error makes no progress; retrying would loop forever.
      fatal("iconv failed: %1", strerror(errno));
  }
}

// Convert `data', followed by the rest of stream `fp', from encoding
// `enc' to UNICODE using iconv(), handing each resulting code point
// to unicode_entity().
//
// If iconv doesn't know `enc', an error message is emitted and nothing
// is converted.  Invalid input bytes are skipped silently.
void
conversion_iconv(FILE *fp, const string &data, char *enc)
{
  iconv_t handle = iconv_open(UNICODE, enc);
  if (handle == (iconv_t)-1) {
    if (errno == EINVAL) {
      error("encoding system `%1' not supported by iconv()", enc);
      return;
    }
    fatal("iconv_open failed");
  }
  char inbuf[BUFSIZ];
  int outbuf[BUFSIZ];
  const size_t outbuf_size = sizeof outbuf;
  char *outptr = (char *)outbuf;
  size_t outbytes_left = outbuf_size;
  // Handle `data'.
  char *inptr = (char *)data.contents();
  size_t inbytes_left = data.length();
  iconv_convert_chunk(handle, &inptr, &inbytes_left,
                      outbuf, outbuf_size, &outptr, &outbytes_left);
  // `data' ends with a partial input sequence: continue it in `inbuf'.
  if (inbytes_left > 0)
    memcpy(inbuf, inptr, inbytes_left);
  // Handle `fp' and switch to `inbuf'.
  size_t read_bytes;
  while ((read_bytes = fread(inbuf + inbytes_left, 1,
                             BUFSIZ - inbytes_left, fp)) > 0) {
    inptr = inbuf;
    inbytes_left += read_bytes;
    iconv_convert_chunk(handle, &inptr, &inbytes_left,
                        outbuf, outbuf_size, &outptr, &outbytes_left);
    // Move a trailing partial input sequence to the start of `inbuf'
    // so that the next fread() completes it.
    if (inbytes_left > 0)
      memmove(inbuf, inptr, inbytes_left);
  }
  // Reset the conversion state.  This makes converters which keep
  // pending characters or shift state write their remaining output.
  while (iconv(handle, NULL, NULL, &outptr, &outbytes_left) == (size_t)-1
         && errno == E2BIG)
    iconv_emit_output(outbuf, outbuf_size, &outptr, &outbytes_left);
  iconv_close(handle);
  // XXX use ferror?
  iconv_emit_output(outbuf, outbuf_size, &outptr, &outbytes_left);
}
#endif /* HAVE_ICONV */
742
743 // ---------------------------------------------------------
744 // Handle Byte Order Mark.
745 //
746 // Since we have a chicken-and-egg problem it's necessary
747 // to handle the BOM manually if it is in the data stream.
748 // As documented in the Unicode book it is very unlikely
749 // that any normal text file (regardless of the encoding)
750 // starts with the bytes which represent a BOM.
751 //
752 // Return the BOM in string `BOM'; `data' then starts with
753 // the byte after the BOM.  This function reads (at most)
754 // four bytes from the data stream.
755 //
756 // Return encoding if a BOM is found, NULL otherwise.
757 // ---------------------------------------------------------
758 const char *
759 get_BOM(FILE *fp, string &BOM, string &data)
760 {
761   // The BOM is U+FEFF.  We have thus the following possible
762   // representations.
763   //
764   //   UTF-8: 0xEFBBBF
765   //   UTF-16: 0xFEFF or 0xFFFE
766   //   UTF-32: 0x0000FEFF or 0xFFFE0000
767   static struct {
768     int len;
769     const char *str;
770     const char *name;
771   } BOM_table[] = {
772     {4, "\x00\x00\xFE\xFF", "UTF-32"},
773     {4, "\xFF\xFE\x00\x00", "UTF-32"},
774     {3, "\xEF\xBB\xBF", "UTF-8"},
775     {2, "\xFE\xFF", "UTF-16"},
776     {2, "\xFF\xFE", "UTF-16"},
777   };
778   const int BOM_table_len = sizeof (BOM_table) / sizeof (BOM_table[0]);
779   char BOM_string[4];
780   const char *retval = NULL;
781   int len;
782   for (len = 0; len < 4; len++) {
783     int c = getc(fp);
784     if (c == EOF)
785       break;
786     BOM_string[len] = char(c);
787   }
788   int i;
789   for (i = 0; i < BOM_table_len; i++) {
790     if (BOM_table[i].len <= len
791         && memcmp(BOM_string, BOM_table[i].str, BOM_table[i].len) == 0)
792       break;
793   }
794   int j = 0;
795   if (i < BOM_table_len) {
796     for (; j < BOM_table[i].len; j++)
797       BOM += BOM_string[j];
798     retval = BOM_table[i].name;
799   }
800   for (; j < len; j++)
801     data += BOM_string[j];
802   return retval;
803 }
804
805 // ---------------------------------------------------------
806 // Get first two lines from input stream.
807 //
808 // Return string (allocated with `new') without zero bytes
809 // or NULL in case no coding tag can occur in the data
810 // (which is stored unmodified in `data').
811 // ---------------------------------------------------------
812 char *
813 get_tag_lines(FILE *fp, string &data)
814 {
815   int newline_count = 0;
816   int c, prev = -1;
817   // Handle CR, LF, and CRLF as line separators.
818   for (int i = 0; i < data.length(); i++) {
819     c = data[i];
820     if (c == '\n' || c == '\r')
821       newline_count++;
822     if (c == '\n' && prev == '\r')
823       newline_count--;
824     prev = c;
825   }
826   if (newline_count > 1)
827     return NULL;
828   int emit_warning = 1;
829   for (int lines = newline_count; lines < 2; lines++) {
830     while ((c = getc(fp)) != EOF) {
831       if (c == '\0' && debug_flag && emit_warning) {
832         fprintf(stderr,
833                 "  null byte(s) found in input stream --\n"
834                 "  search for coding tag might return false result\n");
835         emit_warning = 0;
836       }
837       data += char(c);
838       if (c == '\n' || c == '\r')
839         break;
840     }
841     // Handle CR, LF, and CRLF as line separators.
842     if (c == '\r') {
843       c = getc(fp);
844       if (c != EOF && c != '\n')
845         ungetc(c, fp);
846       else
847         data += char(c);
848     }
849   }
850   return data.extract();
851 }
852
// ---------------------------------------------------------
// Test whether C string `s' begins with a comment, this is,
// with one of
//
//   .\"   .\#   '\"   '\#   \#
//
// where spaces and tabs may follow the leading `.' or "'".
//
// Return 1 if true, 0 otherwise (also for a NULL pointer or
// an empty string).
// ---------------------------------------------------------
int
is_comment_line(char *s)
{
  if (!s)
    return 0;
  switch (*s) {
  case '.':
  case '\'':
    // Skip the control character and any blanks following it.
    do
      s++;
    while (*s == ' ' || *s == '\t');
    // s[1] is only inspected if s[0] isn't the terminating null byte.
    return s[0] == '\\' && (s[1] == '"' || s[1] == '#');
  case '\\':
    return s[1] == '#';
  default:
    return 0;
  }
}
883
884 // ---------------------------------------------------------
885 // Get a value/variable pair from a local variables list
886 // in a C string which look like this:
887 //
888 //   <variable1>: <value1>; <variable2>: <value2>; ...
889 //
890 // Leading and trailing blanks are ignored.  There might be
891 // more than one blank after `:' and `;'.
892 //
893 // Return position of next value/variable pair or NULL if
894 // at end of data.
895 // ---------------------------------------------------------
896 char *
897 get_variable_value_pair(char *d1, char **variable, char **value)
898 {
899   static char var[MAX_VAR_LEN], val[MAX_VAR_LEN];
900   *variable = var;
901   *value = val;
902   while (*d1 == ' ' || *d1 == '\t')
903     d1++;
904   // Get variable.
905   int l = 0;
906   while (l < MAX_VAR_LEN - 1 && *d1 && !strchr(";: \t", *d1))
907     var[l++] = *(d1++);
908   var[l] = 0;
909   // Skip everything until `:', `;', or end of data.
910   while (*d1 && *d1 != ':' && *d1 != ';')
911     d1++;
912   val[0] = 0;
913   if (!*d1)
914     return NULL;
915   if (*d1 == ';')
916     return d1 + 1;
917   d1++;
918   while (*d1 == ' ' || *d1 == '\t')
919     d1++;
920   // Get value.
921   l = 0;
922   while (l < MAX_VAR_LEN - 1 && *d1 && !strchr("; \t", *d1))
923     val[l++] = *(d1++);
924   val[l] = 0;
925   // Skip everything until `;' or end of data.
926   while (*d1 && *d1 != ';')
927     d1++;
928   if (*d1 == ';')
929     return d1 + 1;
930   return NULL;
931 }
932
933 // ---------------------------------------------------------
934 // Check coding tag in the read buffer.
935 //
936 // We search for the following line:
937 //
938 //   <comment> ... -*-<local variables list>-*-
939 //
940 // (`...' might be anything).
941 //
942 // <comment> can be one of the following syntax forms at the
943 // beginning of the line:
944 //
945 //   .\"   .\#   '\"   '\#   \#
946 //
947 // There can be whitespace after the leading `.' or "'".
948 //
949 // The local variables list must occur within the first
950 // comment block at the very beginning of the data stream.
951 //
952 // Within the <local variables list>, we search for
953 //
954 //   coding: <value>
955 //
956 // which specifies the coding system used for the data
957 // stream.
958 //
959 // Return <value> if found, NULL otherwise.
960 //
961 // Note that null bytes in the data are skipped before applying
962 // the algorithm.  This should work even with files encoded as
963 // UTF-16 or UTF-32 (or its siblings) in most cases.
964 //
965 // XXX Add support for tag at the end of buffer.
966 // ---------------------------------------------------------
967 char *
968 check_coding_tag(FILE *fp, string &data)
969 {
970   char *inbuf = get_tag_lines(fp, data);
971   char *lineend;
972   for (char *p = inbuf; is_comment_line(p); p = lineend + 1) {
973     if ((lineend = strchr(p, '\n')) == NULL)
974       break;
975     *lineend = 0;               // switch temporarily to '\0'
976     char *d1 = strstr(p, "-*-");
977     char *d2 = 0;
978     if (d1)
979       d2 = strstr(d1 + 3, "-*-");
980     *lineend = '\n';            // restore newline
981     if (!d1 || !d2)
982       continue;
983     *d2 = 0;                    // switch temporarily to '\0'
984     d1 += 3;
985     while (d1) {
986       char *variable, *value;
987       d1 = get_variable_value_pair(d1, &variable, &value);
988       if (!strcasecmp(variable, "coding")) {
989         *d2 = '-';              // restore '-'
990         a_delete inbuf;
991         return value;
992       }
993     }
994     *d2 = '-';                  // restore '-'
995   }
996   a_delete inbuf;
997   return NULL;
998 }
999
1000 // ---------------------------------------------------------
1001 // Handle an input file.  If filename is `-' handle stdin.
1002 //
1003 // Return 1 on success, 0 otherwise.
1004 // ---------------------------------------------------------
1005 int
1006 do_file(const char *filename)
1007 {
1008   FILE *fp;
1009   string BOM, data;
1010   if (strcmp(filename, "-")) {
1011     if (debug_flag)
1012       fprintf(stderr, "file `%s':\n", filename);
1013     fp = fopen(filename, FOPEN_RB);
1014     if (!fp) {
1015       error("can't open `%1': %2", filename, strerror(errno));
1016       return 0;
1017     }
1018   }
1019   else {
1020     if (debug_flag)
1021       fprintf(stderr, "standard input:\n");
1022     SET_BINARY(fileno(stdin));
1023     fp = stdin;
1024   }
1025   const char *BOM_encoding = get_BOM(fp, BOM, data);
1026   // Determine the encoding.
1027   char *encoding;
1028   if (user_encoding[0]) {
1029     if (debug_flag) {
1030       fprintf(stderr, "  user-specified encoding `%s', "
1031                       "no search for coding tag\n",
1032                       user_encoding);
1033       if (BOM_encoding && strcmp(BOM_encoding, user_encoding))
1034         fprintf(stderr, "  but BOM in data stream implies encoding `%s'!\n",
1035                         BOM_encoding);
1036     }
1037     encoding = (char *)user_encoding;
1038   }
1039   else if (BOM_encoding) {
1040     if (debug_flag)
1041       fprintf(stderr, "  found BOM, no search for coding tag\n");
1042     encoding = (char *)BOM_encoding;
1043   }
1044   else {
1045     // `check_coding_tag' returns a pointer to a static array (or NULL).
1046     char *file_encoding = check_coding_tag(fp, data);
1047     if (!file_encoding) {
1048       if (debug_flag)
1049         fprintf(stderr, "  no file encoding\n");
1050       file_encoding = default_encoding;
1051     }
1052     else
1053       if (debug_flag)
1054         fprintf(stderr, "  file encoding: `%s'\n", file_encoding);
1055     encoding = file_encoding;
1056   }
1057   strncpy(encoding_string, encoding, MAX_VAR_LEN - 1);
1058   encoding_string[MAX_VAR_LEN - 1] = 0;
1059   encoding = encoding_string;
1060   // Translate from MIME & Emacs encoding names to locale encoding names.
1061   encoding = emacs2mime(encoding_string);
1062   if (encoding[0] == '\0') {
1063     error("encoding `%1' not supported, not a portable encoding",
1064           encoding_string);
1065     return 0;
1066   }
1067   if (debug_flag)
1068     fprintf(stderr, "  encoding used: `%s'\n", encoding);
1069   if (!raw_flag)
1070     printf(".lf 1 %s\n", filename);
1071   int success = 1;
1072   // Call converter (converters write to stdout).
1073   if (!strcasecmp(encoding, "ISO-8859-1"))
1074     conversion_latin1(fp, BOM + data);
1075   else if (!strcasecmp(encoding, "UTF-8"))
1076     conversion_utf8(fp, data);
1077   else if (!strcasecmp(encoding, "cp1047"))
1078     conversion_cp1047(fp, BOM + data);
1079   else {
1080 #if HAVE_ICONV
1081     conversion_iconv(fp, BOM + data, encoding);
1082 #else
1083     error("encoding system `%1' not supported", encoding);
1084     success = 0;
1085 #endif /* HAVE_ICONV */
1086   }
1087   if (fp != stdin)
1088     fclose(fp);
1089   return success;
1090 }
1091
1092 // ---------------------------------------------------------
1093 // Print usage.
1094 // ---------------------------------------------------------
1095 void
1096 usage(FILE *stream)
1097 {
1098   fprintf(stream, "usage: %s [ option ] [ files ]\n"
1099                   "\n"
1100                   "-d           show debugging messages\n"
1101                   "-D encoding  specify default encoding\n"
1102                   "-e encoding  specify input encoding\n"
1103                   "-h           print this message\n"
1104                   "-r           don't add .lf requests\n"
1105                   "-v           print version number\n"
1106                   "\n"
1107                   "The default encoding is `%s'.\n",
1108                   program_name, default_encoding);
1109 }
1110
1111 // ---------------------------------------------------------
1112 // Main routine.
1113 // ---------------------------------------------------------
1114 int
1115 main(int argc, char **argv)
1116 {
1117   program_name = argv[0];
1118   // Determine the default encoding.  This must be done before
1119   // getopt() is called since the usage message shows the default
1120   // encoding.
1121   setlocale(LC_ALL, "");
1122   char *locale = getlocale(LC_CTYPE);
1123   if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX"))
1124     strcpy(default_encoding, "latin1");
1125   else {
1126     strncpy(default_encoding, locale_charset(), MAX_VAR_LEN - 1);
1127     default_encoding[MAX_VAR_LEN - 1] = 0;
1128   }
1129
1130   program_name = argv[0];
1131   int opt;
1132   static const struct option long_options[] = {
1133     { "help", no_argument, 0, 'h' },
1134     { "version", no_argument, 0, 'v' },
1135     { NULL, 0, 0, 0 }
1136   };
1137   // Parse the command line options.
1138   while ((opt = getopt_long(argc, argv,
1139                             "dD:e:hrv", long_options, NULL)) != EOF)
1140     switch (opt) {
1141     case 'v':
1142       printf("GNU preconv (groff) version %s %s iconv support\n",
1143              Version_string,
1144 #ifdef HAVE_ICONV
1145              "with"
1146 #else
1147              "without"
1148 #endif /* HAVE_ICONV */
1149             );
1150       exit(0);
1151       break;
1152     case 'd':
1153       debug_flag = 1;
1154       break;
1155     case 'e':
1156       if (optarg) {
1157         strncpy(user_encoding, optarg, MAX_VAR_LEN - 1);
1158         user_encoding[MAX_VAR_LEN - 1] = 0;
1159       }
1160       else
1161         user_encoding[0] = 0;
1162       break;
1163     case 'D':
1164       if (optarg) {
1165         strncpy(default_encoding, optarg, MAX_VAR_LEN - 1);
1166         default_encoding[MAX_VAR_LEN - 1] = 0;
1167       }
1168       break;
1169     case 'r':
1170       raw_flag = 1;
1171       break;
1172     case 'h':
1173       usage(stdout);
1174       exit(0);
1175       break;
1176     case '?':
1177       usage(stderr);
1178       exit(1);
1179       break;
1180     default:
1181       assert(0);
1182     }
1183   int nbad = 0;
1184   if (debug_flag)
1185     fprintf(stderr, "default encoding: `%s'\n", default_encoding);
1186   if (optind >= argc)
1187     nbad += !do_file("-");
1188   else
1189     for (int i = optind; i < argc; i++)
1190       nbad += !do_file(argv[i]);
1191   if (ferror(stdout) || fflush(stdout) < 0)
1192     fatal("output error");
1193   return nbad != 0;
1194 }
1195
1196 /* end of preconv.cpp */