1 /* Character set conversion support for GDB.
3 Copyright (C) 2001, 2003, 2007, 2008, 2009 Free Software Foundation, Inc.
5 This file is part of GDB.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #include "gdb_assert.h"
24 #include "gdb_obstack.h"
26 #include "charset-list.h"
30 #include "gdb_string.h"
34 /* How GDB's character set support works
36 GDB has three global settings:
38 - The `current host character set' is the character set GDB should
39 use in talking to the user, and which (hopefully) the user's
40 terminal knows how to display properly. Most users should not
43 - The `current target character set' is the character set the
44 program being debugged uses.
46 - The `current target wide character set' is the wide character set
47 the program being debugged uses, that is, the encoding used for
50 There are commands to set each of these, and mechanisms for
51 choosing reasonable default values. GDB has a global list of
52 character sets that it can use as its host or target character
55 The header file `charset.h' declares various functions that
56 different pieces of GDB need to perform tasks like:
58 - printing target strings and characters to the user's terminal
59 (mostly target->host conversions),
61 - building target-appropriate representations of strings and
62 characters the user enters in expressions (mostly host->target
67 To avoid excessive code duplication and maintenance efforts,
68 GDB simply requires a capable iconv function. Users on platforms
69 without a suitable iconv can use the GNU iconv library. */
74 /* Provide a phony iconv that does as little as possible. Also,
75 arrange for there to be a single available character set. */
77 #undef GDB_DEFAULT_HOST_CHARSET
78 #define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
79 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
80 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "ISO-8859-1"
81 #undef DEFAULT_CHARSET_NAMES
82 #define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,
91 #define ICONV_CONST const
93 /* Some systems don't have EILSEQ, so we define it here, but not as
94 EINVAL, because callers of `iconv' want to distinguish EINVAL and
95 EILSEQ. This is what iconv.h from libiconv does as well. Note
96 that wchar.h may also define EILSEQ, so this needs to be after we
97 include wchar.h, which happens in defs.h through gdb_wchar.h. */
/* Phony stand-in for iconv_open.  FROM is checked against "UCS-4BE",
   "wchar_t" and GDB_DEFAULT_HOST_CHARSET; TO is checked against
   "wchar_t" and GDB_DEFAULT_HOST_CHARSET.  The value returned at the
   end is not a real descriptor, only a flag (see below).
   NOTE(review): the statements guarded by the two checks are not
   visible in this listing; presumably they reject unsupported
   conversions -- confirm.  */
103 iconv_open (const char *to, const char *from)
105 /* We allow conversions from UCS-4BE, wchar_t, and the host charset.
106 We allow conversions to wchar_t and the host charset. */
107 if (strcmp (from, "UCS-4BE") && strcmp (from, "wchar_t")
108 && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
110 if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
113 /* Return 1 if we are converting from UCS-4BE, 0 otherwise. This is
114 used as a flag in calls to iconv. */
115 return !strcmp (from, "UCS-4BE");
119 iconv_close (iconv_t arg)
/* Phony stand-in for iconv.  UCS_FLAG is the flag value produced by
   the phony iconv_open.  Two paths are visible: one walks the input in
   4-byte groups, folding the bytes of each group into C; the other
   copies input bytes to *OUTBUF unchanged with memcpy and reduces
   *OUTBYTESLEFT by the amount copied.  NOTE(review): much of the body,
   including how UCS_FLAG selects a path and what is returned, is not
   visible in this listing.  */
125 iconv (iconv_t ucs_flag, const char **inbuf, size_t *inbytesleft,
126 char **outbuf, size_t *outbytesleft)
130 while (*inbytesleft >= 4)
135 for (j = 0; j < 4; ++j)
138 c += (*inbuf)[j] & 0xff;
153 if (*inbytesleft < 4)
161 /* In all other cases we simply copy input bytes to the
163 size_t amt = *inbytesleft;
164 if (amt > *outbytesleft)
166 memcpy (*outbuf, *inbuf, amt);
170 *outbytesleft -= amt;
179 /* The number of non-reversible conversions -- but they were all
188 /* The global lists of character sets and translations. */
191 #ifndef GDB_DEFAULT_TARGET_CHARSET
192 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
195 #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
196 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UCS-4"
199 static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET;
200 static const char *host_charset_name = "auto";
/* Show callback registered for the "host-charset" setting.  Prints
   VALUE as the host character set; when VALUE is "auto" the message
   also reports the charset that "auto" currently resolves to
   (auto_host_charset_name).  */
202 show_host_charset_name (struct ui_file *file, int from_tty,
203 struct cmd_list_element *c,
206 if (!strcmp (value, "auto"))
207 fprintf_filtered (file,
208 _("The host character set is \"auto; currently %s\".\n"),
209 auto_host_charset_name);
211 fprintf_filtered (file, _("The host character set is \"%s\".\n"), value);
214 static const char *target_charset_name = GDB_DEFAULT_TARGET_CHARSET;
/* Show callback registered for the "target-charset" setting.  Prints
   a message naming the target character set to FILE.  */
216 show_target_charset_name (struct ui_file *file, int from_tty,
217 struct cmd_list_element *c, const char *value)
219 fprintf_filtered (file, _("The target character set is \"%s\".\n"),
223 static const char *target_wide_charset_name = GDB_DEFAULT_TARGET_WIDE_CHARSET;
/* Show callback registered for the "target-wide-charset" setting.
   Prints a message naming the target wide character set to FILE.  */
225 show_target_wide_charset_name (struct ui_file *file, int from_tty,
226 struct cmd_list_element *c, const char *value)
228 fprintf_filtered (file, _("The target wide character set is \"%s\".\n"),
232 static const char *default_charset_names[] =
234 DEFAULT_CHARSET_NAMES
238 static const char **charset_enum;
241 /* If the target wide character set has big- or little-endian
242 variants, these are the corresponding names. */
243 static const char *target_wide_charset_be_name;
244 static const char *target_wide_charset_le_name;
246 /* A helper function for validate which sets the target wide big- and
247 little-endian character set names, if possible. */
/* Both names are reset to NULL first.  Each entry of charset_enum is
   then compared against target_wide_charset_name; an entry in which
   that name is immediately followed by exactly "BE" or "LE" is
   recorded as the big- or little-endian variant respectively.
   NOTE(review): the statement controlled by the strncmp test is not
   visible in this listing; presumably it skips non-matching entries.  */
250 set_be_le_names (void)
254 target_wide_charset_le_name = NULL;
255 target_wide_charset_be_name = NULL;
257 len = strlen (target_wide_charset_name);
258 for (i = 0; charset_enum[i]; ++i)
260 if (strncmp (target_wide_charset_name, charset_enum[i], len))
262 if ((charset_enum[i][len] == 'B'
263 || charset_enum[i][len] == 'L')
264 && charset_enum[i][len + 1] == 'E'
265 && charset_enum[i][len + 2] == '\0')
267 if (charset_enum[i][len] == 'B')
268 target_wide_charset_be_name = charset_enum[i];
270 target_wide_charset_le_name = charset_enum[i];
275 /* Shared validation for the 'set charset', 'set host-charset',
276 'set target-charset' and 'set target-wide-charset' sfuncs. */
282 const char *host_cset = host_charset ();
284 desc = iconv_open (target_wide_charset_name, host_cset);
285 if (desc == (iconv_t) -1)
286 error ("Cannot convert between character sets `%s' and `%s'",
287 target_wide_charset_name, host_cset);
290 desc = iconv_open (target_charset_name, host_cset);
291 if (desc == (iconv_t) -1)
292 error ("Cannot convert between character sets `%s' and `%s'",
293 target_charset_name, host_cset);
299 /* This is the sfunc for the 'set charset' command. */
/* The 'set charset' command is registered with host_charset_name as
   its variable, so the command itself only updates the host name; the
   target name is copied from it here.  */
301 set_charset_sfunc (char *charset, int from_tty, struct cmd_list_element *c)
303 /* CAREFUL: set the target charset here as well. */
304 target_charset_name = host_charset_name;
308 /* 'set host-charset' command sfunc. We need a wrapper here because
309 the function needs to have a specific signature. */
311 set_host_charset_sfunc (char *charset, int from_tty,
312 struct cmd_list_element *c)
317 /* Wrapper for the 'set target-charset' command. */
319 set_target_charset_sfunc (char *charset, int from_tty,
320 struct cmd_list_element *c)
325 /* Wrapper for the 'set target-wide-charset' command. */
327 set_target_wide_charset_sfunc (char *charset, int from_tty,
328 struct cmd_list_element *c)
333 /* sfunc for the 'show charset' command. */
/* Reports all three settings by calling the host, target and target
   wide show functions with the current values of the corresponding
   variables.  */
335 show_charset (struct ui_file *file, int from_tty, struct cmd_list_element *c,
338 show_host_charset_name (file, from_tty, c, host_charset_name);
339 show_target_charset_name (file, from_tty, c, target_charset_name);
340 show_target_wide_charset_name (file, from_tty, c, target_wide_charset_name);
344 /* Accessor functions. */
349 if (!strcmp (host_charset_name, "auto"))
350 return auto_host_charset_name;
351 return host_charset_name;
/* Return the current target character set name.  */
355 target_charset (void)
357 return target_charset_name;
/* Return the name of the target wide character set to use for
   BYTE_ORDER.  An endian-specific name recorded by set_be_le_names
   (target_wide_charset_be_name or target_wide_charset_le_name) is
   returned when it is non-NULL; otherwise the plain
   target_wide_charset_name is returned.  NOTE(review): the branch
   joining the big- and little-endian checks is not visible in this
   listing; presumably it is an else.  */
361 target_wide_charset (enum bfd_endian byte_order)
363 if (byte_order == BFD_ENDIAN_BIG)
365 if (target_wide_charset_be_name)
366 return target_wide_charset_be_name;
370 if (target_wide_charset_le_name)
371 return target_wide_charset_le_name;
374 return target_wide_charset_name;
378 /* Host character set management. For the time being, we assume that
379 the host character set is some superset of ASCII. */
382 host_letter_to_control_character (char c)
389 /* Convert a host character, C, to its hex value. C must already have
390 been validated using isxdigit. */
/* Lower-case 'a'..'f' gets its own test; the gdb_assert states that a
   character reaching it must lie in 'A'..'F'.  NOTE(review): the
   return statements are not visible in this listing.  */
393 host_hex_value (char c)
397 if (c >= 'a' && c <= 'f')
399 gdb_assert (c >= 'A' && c <= 'F');
404 /* Public character management functions. */
406 /* A cleanup function which is run to close an iconv descriptor. */
/* P is the cleanup argument.  convert_between_encodings registers this
   function with the address of its iconv_t, and the descriptor that
   DESCP points to is the one closed here.  NOTE(review): DESCP is
   presumably P converted to a pointer to iconv_t; the declaration is
   not visible in this listing.  */
409 cleanup_iconv (void *p)
412 iconv_close (*descp);
/* Convert host wide characters to multibyte form with wctomb.
   *PINP and *PINLEFT describe the remaining input (PINLEFT counts
   bytes, compared against sizeof (gdb_wchar_t)); *POUTP is where
   output is written.  While at least one whole gdb_wchar_t remains,
   it is converted into a temporary of MB_CUR_MAX bytes, the R
   resulting bytes are copied to *POUTP, and *PINLEFT drops by
   sizeof (gdb_wchar_t).  perror_with_name reports an internal
   conversion error.  NOTE(review): the error test, any output-space
   check against *POUTLEFT, the pointer updates and the return value
   are not visible in this listing.  */
416 convert_wchar (gdb_wchar_t **pinp, size_t *pinleft, char **poutp, size_t *poutleft)
418 char tmp[MB_CUR_MAX];
421 while (*pinleft >= sizeof(gdb_wchar_t))
423 r = wctomb(tmp, **pinp);
426 perror_with_name ("Internal error while converting character sets");
434 memcpy(*poutp, tmp, r);
438 *pinleft -= sizeof(gdb_wchar_t);
/* Convert NUM_BYTES bytes at BYTES from the FROM encoding to the TO
   encoding, appending the result to the obstack OUTPUT.

   - If FROM and TO are the same string, BYTES is appended unchanged.
   - If FROM is "wchar_t", TO must be the host charset; otherwise
     perror_with_name is called.  No iconv descriptor is opened in
     this case (presumably convert_wchar does the conversion).
   - Otherwise an iconv descriptor is opened and a cleanup is
     registered to close it.

   Output space is reserved on OUTPUT with obstack_blank, and the
   unused tail is given back after each conversion call.  On an invalid
   input sequence, TRANSLIT == translit_none raises an error; otherwise
   WIDTH input bytes are emitted as three-digit octal escapes.  Running
   out of output space leads to a bigger request on the next attempt.
   NOTE(review): several lines (loop structure, errno dispatch, the
   choice between convert_wchar and iconv, final return) are not
   visible in this listing.  */
448 convert_between_encodings (const char *from, const char *to,
449 const gdb_byte *bytes, unsigned int num_bytes,
450 int width, struct obstack *output,
451 enum transliterations translit)
454 struct cleanup *cleanups;
457 unsigned int space_request;
460 /* Often, the host and target charsets will be the same. */
461 if (!strcmp (from, to))
463 obstack_grow (output, bytes, num_bytes);
467 if (!strcmp (from, "wchar_t"))
469 if (strcmp (to, host_charset ()))
470 perror_with_name ("Converting character sets");
471 cleanups = NULL; /* silence gcc complaints */
476 desc = iconv_open (to, from);
477 if (desc == (iconv_t) -1)
478 perror_with_name ("Converting character sets");
479 cleanups = make_cleanup (cleanup_iconv, &desc);
483 inp = (char *) bytes;
485 space_request = num_bytes;
493 old_size = obstack_object_size (output);
494 obstack_blank (output, space_request);
496 outp = obstack_base (output) + old_size;
497 outleft = space_request;
500 r = convert_wchar((gdb_wchar_t **)(void *)&inp, &inleft, &outp, &outleft);
502 r = iconv (desc, (ICONV_CONST char **) &inp, &inleft, &outp, &outleft);
504 /* Now make sure that the object on the obstack only includes
505 bytes we have converted. */
506 obstack_blank (output, - (int) outleft);
508 if (r == (size_t) -1)
516 /* Invalid input sequence. */
517 if (translit == translit_none)
518 error (_("Could not convert character to `%s' character set"),
521 /* We emit escape sequence for the bytes, skip them,
523 for (i = 0; i < width; ++i)
527 sprintf (octal, "\\%.3o", *inp & 0xff);
528 obstack_grow_str (output, octal);
537 /* We ran out of space in the output buffer. Make it
538 bigger next time around. */
543 /* Incomplete input sequence. FIXME: ought to report this
544 to the caller somehow. */
549 perror_with_name ("Internal error while converting character sets");
555 do_cleanups (cleanups);
560 /* An iterator that returns host wchar_t's from a target string. */
561 struct wchar_iterator
563 /* The underlying iconv descriptor. */
566 /* The input string. This is updated as we convert characters. */
568 /* The number of bytes remaining in the input. */
571 /* The width of an input character. */
574 /* The intermediate buffer */
579 /* The output byte. */
583 /* Create a new iterator. */
/* The iterator reads INPUT (BYTES bytes, encoded in CHARSET) through an
   iconv descriptor opened from CHARSET to the host charset;
   perror_with_name is called if that descriptor cannot be opened.
   The intermediate buffer starts out as a single char with no pending
   data (inter_size 1, inter_len 0).  NOTE(review): the remaining
   parameter line and the return statement are not visible in this
   listing.  */
584 struct wchar_iterator *
585 make_wchar_iterator (const gdb_byte *input, size_t bytes, const char *charset,
588 struct wchar_iterator *result;
591 desc = iconv_open (host_charset (), charset);
592 if (desc == (iconv_t) -1)
593 perror_with_name ("Converting character sets");
595 result = XNEW (struct wchar_iterator);
597 result->input = (char *) input;
598 result->bytes = bytes;
599 result->width = width;
601 result->inter = XNEW (char);
602 result->inter_size = 1;
603 result->inter_len = 0;
/* Cleanup callback used by make_cleanup_wchar_iterator.  P is the
   struct wchar_iterator being torn down; its iconv descriptor is
   closed.  NOTE(review): any freeing of the iterator storage is not
   visible in this listing.  */
609 do_cleanup_iterator (void *p)
611 struct wchar_iterator *iter = p;
613 iconv_close (iter->desc);
/* Register a cleanup that runs do_cleanup_iterator on ITER, and return
   the value produced by make_cleanup.  */
619 make_cleanup_wchar_iterator (struct wchar_iterator *iter)
621 return make_cleanup (do_cleanup_iterator, iter);
/* Fetch the next host wide character from ITER.

   *OUT_RESULT receives one of wchar_iterate_ok, wchar_iterate_invalid,
   wchar_iterate_incomplete or wchar_iterate_eof.  In the visible
   success path *OUT_CHARS points at the converted character stored
   inside ITER and *LEN is the number of input bytes consumed since
   the call began.

   Input is first run through iconv into the intermediate buffer of
   ITER (grown with xrealloc when iconv runs out of room); mbtowc then
   turns the buffered bytes into a gdb_wchar_t and the consumed bytes
   are shifted out of the buffer with memmove.  An invalid input
   sequence is skipped by advancing ITER->width bytes.
   NOTE(review): the errno dispatch, the handling of PTR and the
   return values are not visible in this listing.  */
625 wchar_iterate (struct wchar_iterator *iter,
626 enum wchar_iterate_result *out_result,
627 gdb_wchar_t **out_chars,
628 const gdb_byte **ptr,
632 char *orig_inptr = iter->input;
633 size_t orig_in = iter->bytes;
635 /* Try to convert some characters. At first we try to convert just
636 a single character. The reason for this is that iconv does not
637 necessarily update its outgoing arguments when it encounters an
638 invalid input sequence -- but we want to reliably report this to
639 our caller so it can emit an escape sequence. */
640 while (iter->inter_len == 0 && iter->bytes > 0)
643 while (iter->bytes > 0)
645 char *outptr = (char *) &iter->inter[iter->inter_len];
646 size_t out_avail = out_request;
648 size_t r = iconv (iter->desc,
649 (ICONV_CONST char **) &iter->input, &iter->bytes,
650 &outptr, &out_avail);
651 if (r == (size_t) -1)
656 /* Invalid input sequence. Skip it, and let the caller
658 *out_result = wchar_iterate_invalid;
661 iter->input += iter->width;
662 iter->bytes -= iter->width;
666 /* We ran out of space. We still might have converted a
667 character; if so, return it. Otherwise, grow the
668 buffer and try again. */
669 if (out_avail < out_request)
673 if (out_request > iter->inter_size)
675 iter->inter_size = out_request;
676 iter->inter = xrealloc (iter->inter, out_request);
681 /* Incomplete input sequence. Let the caller know, and
682 arrange for future calls to see EOF. */
683 *out_result = wchar_iterate_incomplete;
690 perror_with_name ("Internal error while converting character sets");
694 /* We converted something. */
695 iter->inter_len += out_request - out_avail;
700 if (iter->inter_len > 0)
704 /* Now convert from our charset to wchar_t */
705 r = mbtowc(&iter->out, &iter->inter[0], iter->inter_len);
707 /* This must never happen: we just converted to a valid charset! */
709 perror_with_name ("Internal error while converting character sets");
711 /* NUL bytes are alright */
715 iter->inter_len -= r;
716 memmove(&iter->inter[0], &iter->inter[r], iter->inter_len);
718 *out_result = wchar_iterate_ok;
719 *out_chars = &iter->out;
721 *len = orig_in - iter->bytes;
726 *out_result = wchar_iterate_eof;
731 /* The charset.c module initialization function. */
733 extern initialize_file_ftype _initialize_charset; /* -Wmissing-prototypes */
735 typedef char *char_ptr;
736 DEF_VEC_P (char_ptr);
738 static VEC (char_ptr) *charsets;
/* Phony-iconv variant: the only character set offered is
   GDB_DEFAULT_HOST_CHARSET.  It is pushed onto CHARSETS, followed by a
   terminating NULL.  */
743 find_charset_names (void)
745 VEC_safe_push (char_ptr, charsets, GDB_DEFAULT_HOST_CHARSET);
746 VEC_safe_push (char_ptr, charsets, NULL);
749 #else /* PHONY_ICONV */
751 /* Sometimes, libiconv redefines iconvlist as libiconvlist -- but
752 provides different symbols in the static and dynamic libraries.
753 So, configure may see libiconvlist but not iconvlist. But, calling
754 iconvlist is the right thing to do and will work. Hence we do a
755 check here but unconditionally call iconvlist below. */
756 #if defined (HAVE_ICONVLIST) || defined (HAVE_LIBICONVLIST)
758 /* A helper function that adds some character sets to the vector of
759 all character sets. This is a callback function for iconvlist. */
/* Each of the COUNT entries in NAMES is copied with xstrdup and pushed
   onto CHARSETS.  DATA is not referenced in the visible lines.  */
762 add_one (unsigned int count, const char *const *names, void *data)
766 for (i = 0; i < count; ++i)
767 VEC_safe_push (char_ptr, charsets, xstrdup (names[i]));
/* iconvlist variant: iconvlist reports the known charset names through
   the add_one callback, then CHARSETS is terminated with NULL.  */
773 find_charset_names (void)
775 iconvlist (add_one, NULL);
776 VEC_safe_push (char_ptr, charsets, NULL);
/* Fallback variant: run an "iconv" child process through the pex
   interface and parse its output, one fgets line at a time, pushing an
   xstrdup copy of each name found onto CHARSETS.  Trailing '/'
   characters are stripped, and space-separated names on one line are
   split.  If an error occurred, the collected entries are dropped and
   the vector is truncated to empty.  Finally CHARSETS is terminated
   with NULL.  NOTE(review): the arguments passed to iconv and several
   statements are not visible in this listing.  */
782 find_charset_names (void)
784 struct pex_obj *child;
789 child = pex_init (0, "iconv", NULL);
794 /* Note that we simply ignore errors here. */
795 if (!pex_run (child, PEX_SEARCH | PEX_STDERR_TO_STDOUT, "iconv",
796 args, NULL, NULL, &err))
798 FILE *in = pex_read_output (child, 0);
800 /* POSIX says that iconv -l uses an unspecified format. We
801 parse the glibc and libiconv formats; feel free to add others
805 /* The size of buf is chosen arbitrarily. */
810 r = fgets (buf, sizeof (buf), in);
816 /* Strip off the newline. */
818 /* Strip off one or two '/'s. glibc will print lines like
819 "8859_7//", but also "10646-1:1993/UCS4/". */
820 if (buf[len - 1] == '/')
822 if (buf[len - 1] == '/')
826 /* libiconv will print multiple entries per line, separated
834 /* Find the next space, or end-of-line. */
835 for (p = start; *p && *p != ' '; ++p)
837 /* Ignore an empty result. */
842 VEC_safe_push (char_ptr, charsets, xstrdup (start));
845 /* Skip any extra spaces. */
846 for (start = p + 1; *start && *start == ' '; ++start)
851 if (pex_get_status (child, 1, &status)
852 && WIFEXITED (status) && !WEXITSTATUS (status))
861 /* Some error occurred, so drop the vector. */
864 for (ix = 0; VEC_iterate (char_ptr, charsets, ix, elt); ++ix)
866 VEC_truncate (char_ptr, charsets, 0);
869 VEC_safe_push (char_ptr, charsets, NULL);
872 #endif /* HAVE_ICONVLIST || HAVE_LIBICONVLIST */
873 #endif /* PHONY_ICONV */
876 _initialize_charset (void)
878 struct cmd_list_element *new_cmd;
880 /* The first element is always "auto"; then we skip it for the
881 commands where it is not allowed. */
882 VEC_safe_push (char_ptr, charsets, xstrdup ("auto"));
883 find_charset_names ();
885 if (VEC_length (char_ptr, charsets) > 1)
886 charset_enum = (const char **) VEC_address (char_ptr, charsets);
888 charset_enum = default_charset_names;
891 #ifdef HAVE_LANGINFO_CODESET
892 auto_host_charset_name = nl_langinfo (CODESET);
893 /* Solaris will return `646' here -- but the Solaris iconv then
894 does not accept this. */
895 if (!strcmp (auto_host_charset_name, "646"))
896 auto_host_charset_name = "ASCII";
897 target_charset_name = auto_host_charset_name;
903 add_setshow_enum_cmd ("charset", class_support,
904 &charset_enum[1], &host_charset_name, _("\
905 Set the host and target character sets."), _("\
906 Show the host and target character sets."), _("\
907 The `host character set' is the one used by the system GDB is running on.\n\
908 The `target character set' is the one used by the program being debugged.\n\
909 You may only use supersets of ASCII for your host character set; GDB does\n\
910 not support any others.\n\
911 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
912 /* Note that the sfunc below needs to set
913 target_charset_name, because the 'set
914 charset' command sets two variables. */
917 &setlist, &showlist);
919 add_setshow_enum_cmd ("host-charset", class_support,
920 charset_enum, &host_charset_name, _("\
921 Set the host character set."), _("\
922 Show the host character set."), _("\
923 The `host character set' is the one used by the system GDB is running on.\n\
924 You may only use supersets of ASCII for your host character set; GDB does\n\
925 not support any others.\n\
926 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
927 set_host_charset_sfunc,
928 show_host_charset_name,
929 &setlist, &showlist);
931 add_setshow_enum_cmd ("target-charset", class_support,
932 &charset_enum[1], &target_charset_name, _("\
933 Set the target character set."), _("\
934 Show the target character set."), _("\
935 The `target character set' is the one used by the program being debugged.\n\
936 GDB translates characters and strings between the host and target\n\
937 character sets as needed.\n\
938 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
939 set_target_charset_sfunc,
940 show_target_charset_name,
941 &setlist, &showlist);
943 add_setshow_enum_cmd ("target-wide-charset", class_support,
944 &charset_enum[1], &target_wide_charset_name,
946 Set the target wide character set."), _("\
947 Show the target wide character set."), _("\
948 The `target wide character set' is the one used by the program being debugged.\n\
949 In particular it is the encoding used by `wchar_t'.\n\
950 GDB translates characters and strings between the host and target\n\
951 character sets as needed.\n\
952 To see a list of the character sets GDB supports, type\n\
953 `set target-wide-charset'<TAB>"),
954 set_target_wide_charset_sfunc,
955 show_target_wide_charset_name,
956 &setlist, &showlist);