2 Copyright (C) 1992, 1997-2002, 2004-2014 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
19 /* Messy DOS-specific code for correctly treating binary, Unix text
22 This has several aspects:
24 * Guessing the file type (unless the user tells us);
25 * Stripping CR characters from DOS text files (otherwise regex
26 functions won't work correctly);
27 * Reporting correct byte count with -b for any kind of file.
34 UNKNOWN, DOS_BINARY, DOS_TEXT, UNIX_TEXT
38 off_t pos; /* position in buffer passed to matcher */
39 off_t add; /* how much to add when reporting char position */
/* File-scope state shared by the DOS file handling routines in this file. */

/* Set to 1 by dos_unix_byte_offsets; tested before the position map is
   built and before a byte offset is translated. */
42 static int dos_report_unix_offset = 0;
/* Type of the file currently being processed.  Reloaded from
   dos_use_file_type for a new file and, while still UNKNOWN, filled in
   by guess_type. */
44 static File_type dos_file_type = UNKNOWN;
/* Type that each new file starts out with; UNKNOWN leaves the decision
   to guess_type. */
45 static File_type dos_use_file_type = UNKNOWN;
/* Reset to 0 for a new file and stored as the 'add' value of new map
   entries (presumably the running count of stripped CR characters; the
   increment is not visible here -- confirm). */
46 static off_t dos_stripped_crs = 0;
/* Position map: an array of struct dos_map grown with xrealloc. */
47 static struct dos_map *dos_pos_map;
/* Number of entries allocated for dos_pos_map. */
48 static int dos_pos_map_size = 0;
/* Index of the most recently stored map entry; upper bound for the
   forward search in dossified_pos. */
49 static int dos_pos_map_used = 0;
/* inp_map_idx: index at which undossify_input stores map entries.
   out_map_idx: index dossified_pos starts its lookup from. */
50 static int inp_map_idx = 0, out_map_idx = 1;
52 /* Set default DOS file type to binary. */
57 dos_use_file_type = DOS_BINARY;
60 /* Tell DOS routines to report Unix offset. */
/* Takes no arguments.  The flag set here is tested by undossify_input
   (together with out_byte) and by dossified_pos.
   NOTE(review): only part of this definition is visible here, so any
   condition guarding the assignment cannot be confirmed. */
62 dos_unix_byte_offsets (void)
65 dos_report_unix_offset = 1;
68 /* Guess DOS file type by looking at its contents. */
/* BUF points to the data to classify and BUFLEN gives its length
   (presumably in bytes; the scanning loop is not visible here --
   confirm).  The visible return statement yields DOS_TEXT when a CR-LF
   pair was seen and UNIX_TEXT otherwise; according to the comment
   below, a NUL character makes the file binary. */
70 guess_type (char *buf, size_t buflen)
77 /* Treat a file as binary if it has a NUL character. */
81 /* CR before LF means DOS text file (unless we later see
82 binary characters). */
/* The 'buflen &&' test is evaluated before bp[1] is read, so it appears
   to keep the look-ahead inside the buffer.  Assumes buflen counts the
   characters remaining after *bp at this point -- confirm against the
   loop header, which is not visible here. */
83 else if (*bp == '\r' && buflen && bp[1] == '\n')
89 return crlf_seen ? DOS_TEXT : UNIX_TEXT;
92 /* Convert external DOS file representation to internal.
93 Return the count of characters left in the buffer.
94 Build table to map character positions when reporting byte counts. */
/* BUF is the buffer to convert and BUFLEN its length.
   NOTE(review): several lines of this definition are not visible in
   this excerpt (the declarations of destp, bufbeg and totalcc, the copy
   loop, the return statement), so the notes added below describe only
   the statements that are shown. */
96 undossify_input (char *buf, size_t buflen)
105 /* New file: forget everything we knew about character
106 position mapping table and file type. */
109 dos_pos_map_used = 0;
110 dos_stripped_crs = 0;
/* Start from the preselected type; if that is UNKNOWN the contents are
   examined just below. */
111 dos_file_type = dos_use_file_type;
114 /* Guess if this file is binary, unless we already know that. */
115 if (dos_file_type == UNKNOWN)
116 dos_file_type = guess_type(buf, buflen);
118 /* If this file is to be treated as DOS Text, strip the CR characters
119 and maybe build the table for character position mapping on output. */
120 if (dos_file_type == DOS_TEXT)
/* This test requires out_byte to be set and Unix offsets not to have
   been requested.  out_byte is defined outside this excerpt
   (presumably the flag behind the -b option named in the file header --
   confirm). */
134 if (out_byte && !dos_report_unix_offset)
/* Loops while the current input character is a CR; the loop body is
   not visible here. */
137 while (buflen && *buf == '\r')
/* Grow the map once inp_map_idx has reached the last allocated slot,
   so that the slot after inp_map_idx is valid too (the sentinel code
   below advances the index by one before writing again). */
143 if (inp_map_idx >= dos_pos_map_size - 1)
/* New size is twice inp_map_idx, or 1000 entries when inp_map_idx is
   still 0. */
145 dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000;
146 dos_pos_map = xrealloc(dos_pos_map,
148 sizeof(struct dos_map));
153 /* Add sentinel entry. */
154 dos_pos_map[inp_map_idx].pos = 0;
155 dos_pos_map[inp_map_idx++].add = 0;
/* The post-increment above leaves inp_map_idx on the first real
   entry. */
157 /* Initialize first real entry. */
158 dos_pos_map[inp_map_idx].add = 0;
161 /* Put the new entry. If the stripped CR characters
162 precede a Newline (the usual case), pretend that
163 they were found *after* the Newline. This makes
164 displayed byte offsets more reasonable in some
165 cases, and fits better the intuitive notion that
166 the line ends *before* the CR, not *after* it. */
/* The stored position is the offset inside this buffer
   (destp - bufbeg) plus totalcc.
   NOTE(review): totalcc is declared outside this excerpt; presumably
   it is the number of characters handled in earlier buffers --
   confirm. */
168 dos_pos_map[inp_map_idx-1].pos =
169 (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc;
/* dossified_pos adds this value to byte positions that fall into this
   entry. */
170 dos_pos_map[inp_map_idx].add = dos_stripped_crs;
171 dos_pos_map_used = inp_map_idx;
173 /* The following will be updated on the next pass. */
174 dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1;
185 /* Convert internal byte count into external. */
/* BYTENO is a byte position in the internal representation.  The
   visible return statement yields BYTENO plus the 'add' field of the
   map entry selected for it.  out_map_idx is a file-scope variable, so
   the entry found by one call is the starting point of the next. */
187 dossified_pos (off_t byteno)
/* Taken when the file is not DOS text or Unix offsets were requested.
   NOTE(review): the statement controlled by this test is not visible
   here; presumably it returns BYTENO unchanged -- confirm. */
195 if (dos_file_type != DOS_TEXT || dos_report_unix_offset)
198 /* Optimization: usually the file will be scanned sequentially.
199 So in most cases, this byte position will be found in the
200 table near the previous one, as recorded in 'out_map_idx'. */
/* Entry out_map_idx is the right one when
   pos_lo <= byteno < pos_hi, as the two tests below show. */
201 pos_lo = dos_pos_map[out_map_idx-1].pos;
202 pos_hi = dos_pos_map[out_map_idx].pos;
204 /* If the initial guess failed, search up or down, as
205 appropriate, beginning with the previous place. */
206 if (byteno >= pos_hi)
/* Forward search; it does not go past dos_pos_map_used, the last
   entry that undossify_input stored. */
209 while (out_map_idx < dos_pos_map_used
210 && byteno >= dos_pos_map[out_map_idx].pos)
214 else if (byteno < pos_lo)
/* Backward search; it stops at index 1, so entry 0 (the sentinel with
   pos 0) is only ever used as a lower bound. */
217 while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos)
221 return byteno + dos_pos_map[out_map_idx].add;