#include <stdio.h>
#include <string.h>

/* Size in bytes of a charset output buffer, including the terminating NUL. */
#define CHARSET_MAX 41

/*
 * Extract the next token from a Content-Type style header value.
 *
 * *pp is the current read position and is advanced past the token.
 * Tokens are:
 *   - an atom: a run of characters ended by whitespace, '(', ';', '/',
 *     '=', '"' or the end of input;
 *   - a quoted string: returned including both quote characters, with
 *     *pp left just after the closing quote;
 *   - one of the single characters ';', '/' or '='.
 * Whitespace and parenthesised (possibly nested) comments between tokens
 * are skipped.  A backslash makes the following character literal.
 *
 * Returns a pointer to the first character of the token (the token ends
 * at the updated *pp), or NULL when no further token can be read: end of
 * input outside an atom, an unmatched ')' or a trailing backslash.
 */
static const char *getTok(const char **pp)
{
    /*
     * The enumerators are ordered on purpose: every value above `init`
     * is a comment nesting depth (inComment = one level, inComment + 1 =
     * two levels, ...), so '(' and ')' can simply step the state.
     */
    enum { inAtom, inString, init, inComment };
    int state = init;
    const char *tokStart = NULL;

    for (;;) {
        switch (**pp) {
        case '\0':
            /* An atom that runs up to the end of input is still complete. */
            if (state == inAtom) {
                return tokStart;
            }
            return NULL;
        case ' ':
        case '\r':
        case '\t':
        case '\n':
            if (state == inAtom) {
                return tokStart;
            }
            break;
        case '(':
            if (state == inAtom) {
                return tokStart;
            }
            if (state != inString) {
                state++; /* open a comment, or nest one level deeper */
            }
            break;
        case ')':
            if (state > init) {
                --state; /* close one comment level */
            } else if (state != inString) {
                return NULL; /* ')' with no comment open */
            }
            break;
        case ';':
        case '/':
        case '=':
            if (state == inAtom) {
                return tokStart;
            }
            if (state == init) {
                return (*pp)++; /* the character is a token by itself */
            }
            break;
        case '\\':
            /* Skip the escaped character so it is never treated as special. */
            ++*pp;
            if (**pp == '\0') {
                return NULL;
            }
            break;
        case '"':
            switch (state) {
            case inString:
                ++*pp; /* include the closing quote in the token */
                return tokStart;
            case inAtom:
                return tokStart;
            case init:
                tokStart = *pp;
                state = inString;
                break;
            default:
                break; /* quotes inside a comment are ignored */
            }
            break;
        default:
            if (state == init) {
                tokStart = *pp;
                state = inAtom;
            }
            break;
        }
        ++*pp;
    }
    /* not reached */
}

/*
 * Compare the token [start, end) with `key`, ignoring the case of the
 * token.  key must be lowercase ASCII.
 *
 * Returns 1 when the token has exactly the length of key and matches it,
 * 0 otherwise (including when start is NULL).
 */
static int matchkey(const char *start, const char *end, const char *key)
{
    if (!start) {
        return 0;
    }
    for (; start != end; start++, key++) {
        /*
         * Token longer than key.  Checked explicitly: without it a token
         * byte equal to 'A' - 'a' would pass the comparison below and the
         * scan would run past the end of key.
         */
        if (*key == '\0') {
            return 0;
        }
        if (*start != *key && *start != 'A' + (*key - 'a')) {
            return 0;
        }
    }
    return *key == '\0';
}

/*
 * Determine the charset named by a Content-Type style value such as
 * "text/xml; charset=utf-8".
 *
 * buf     - NUL-terminated header value; NULL is treated as empty.
 * charset - output buffer of at least CHARSET_MAX bytes.
 *
 * On return charset holds:
 *   - the value of the charset parameter when one is present and fits;
 *   - "us-ascii" for a "text" type without a usable charset parameter;
 *   - the empty string otherwise (type is neither "text" nor
 *     "application", the value is malformed, or a quoted charset value
 *     does not fit in CHARSET_MAX - 1 characters).
 * An unquoted charset value that is too long is ignored and ends the scan.
 */
void getXMLCharset(const char *buf, char *charset)
{
    const char *next = buf;
    const char *p;

    charset[0] = '\0';
    if (!buf) {
        return;
    }

    p = getTok(&next);
    if (matchkey(p, next, "text")) {
        strcpy(charset, "us-ascii");
    } else if (!matchkey(p, next, "application")) {
        return;
    }

    p = getTok(&next);
    if (!p || *p != '/') {
        return;
    }

    /* The subtype is consumed but does not influence the result. */
    (void)getTok(&next);

    p = getTok(&next);
    while (p) {
        char *s;

        if (*p != ';') {
            p = getTok(&next);
            continue;
        }
        /* Expect: charset '=' value.  Anything else is rescanned as-is. */
        p = getTok(&next);
        if (!matchkey(p, next, "charset")) {
            continue;
        }
        p = getTok(&next);
        if (!p || *p != '=') {
            continue;
        }
        p = getTok(&next);
        if (!p) {
            break;
        }

        s = charset;
        if (*p == '"') {
            const char *closeQuote = next - 1;

            /* Copy the text between the quotes, dropping escape backslashes. */
            while (++p != closeQuote) {
                if (*p == '\\') {
                    ++p;
                }
                if (s == charset + CHARSET_MAX - 1) {
                    s = charset; /* too long: report no charset at all */
                    break;
                }
                *s++ = *p;
            }
            *s = '\0';
            /*
             * Always resume after the quoted string so that bytes inside
             * it are never re-examined as tokens.
             */
            p = getTok(&next);
        } else {
            if (next - p > CHARSET_MAX - 1) {
                break;
            }
            while (p != next) {
                *s++ = *p++;
            }
            *s = '\0';
            break;
        }
    }
}

/*
 * Command-line driver: prints the charset found in argv[1].
 * Define XMLMIME_NO_MAIN to leave it out when this file is linked into
 * another program.
 */
#ifndef XMLMIME_NO_MAIN
int main(int argc, char **argv)
{
    char buf[CHARSET_MAX];

    if (argc < 2) {
        fprintf(stderr, "usage: %s content-type\n",
                argc > 0 ? argv[0] : "xmlmime");
        return 1;
    }
    getXMLCharset(argv[1], buf);
    printf("charset = \"%s\"\n", buf);
    return 0;
}
#endif /* XMLMIME_NO_MAIN */