From c167a99d5a382c4f2333ab29002af9b4d6722bb2 Mon Sep 17 00:00:00 2001
From: Bron Gondwana <brong@fastmail.fm>
Date: Mon, 23 Mar 2009 13:48:05 +1100
Subject: [PATCH] Complete rewrite of charset handling, using Perl

---
 imap/imapd.c              |   72 +--
 lib/Makefile.in           |   11 +-
 lib/charset.c             | 1904 +++++++++++++++++++++------------------------
 lib/charset.h             |   29 +-
 lib/charset/iso-2022-jp.t |   33 +-
 lib/charset/iso-2022-kr.t |   12 +-
 lib/chartable.h           |   27 +-
 lib/mkchartable.c         |  975 -----------------------
 lib/mkchartable.pl        |  531 +++++++++++++
 9 files changed, 1487 insertions(+), 2107 deletions(-)
 delete mode 100644 lib/mkchartable.c
 create mode 100644 lib/mkchartable.pl

diff --git a/imap/imapd.c b/imap/imapd.c
index 8acb36e..074d1ed 100644
--- a/imap/imapd.c
+++ b/imap/imapd.c
@@ -7608,26 +7608,16 @@ int parsecharset;
 	    c = getastring(imapd_in, imapd_out, &arg);
 	    if (c == EOF) goto missingarg;
 	    str = charset_convert(arg.s, *charset, NULL, 0);
-	    if (strchr(str, EMPTY)) {
-		/* Force failure */
-		searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
-	    }
-	    else {
-		appendstrlistpat(&searchargs->bcc, str);
-	    }
+	    if (str) appendstrlistpat(&searchargs->bcc, str);
+	    else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
 	}
 	else if (!strcmp(criteria.s, "body")) {
 	    if (c != ' ') goto missingarg;		
 	    c = getastring(imapd_in, imapd_out, &arg);
 	    if (c == EOF) goto missingarg;
 	    str = charset_convert(arg.s, *charset, NULL, 0);
-	    if (strchr(str, EMPTY)) {
-		/* Force failure */
-		searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
-	    }
-	    else {
-		appendstrlistpat(&searchargs->body, str);
-	    }
+	    if (str) appendstrlistpat(&searchargs->body, str);
+	    else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
 	}
 	else goto badcri;
 	break;
@@ -7638,13 +7628,8 @@ int parsecharset;
 	    c = getastring(imapd_in, imapd_out, &arg);
 	    if (c == EOF) goto missingarg;
 	    str = charset_convert(arg.s, *charset, NULL, 0);
-	    if (strchr(str, EMPTY)) {
-		/* Force failure */
-		searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
-	    }
-	    else {
-		appendstrlistpat(&searchargs->cc, str);
-	    }
+	    if (str) appendstrlistpat(&searchargs->cc, str);
+	    else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
 	}
 	else if (parsecharset && !strcmp(criteria.s, "charset")) {
 	    if (c != ' ') goto missingarg;		
@@ -7675,13 +7660,8 @@ int parsecharset;
 	    c = getastring(imapd_in, imapd_out, &arg);
 	    if (c == EOF) goto missingarg;
 	    str = charset_convert(arg.s, *charset, NULL, 0);
-	    if (strchr(str, EMPTY)) {
-		/* Force failure */
-		searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
-	    }
-	    else {
-		appendstrlistpat(&searchargs->from, str);
-	    }
+	    if (str) appendstrlistpat(&searchargs->from, str);
+	    else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
 	}
 	else goto badcri;
 	break;
@@ -7732,13 +7712,8 @@ int parsecharset;
 	    c = getastring(imapd_in, imapd_out, &arg);
 	    if (c == EOF) goto missingarg;
 	    str = charset_convert(arg.s, *charset, NULL, 0);
-	    if (strchr(str, EMPTY)) {
-		/* Force failure */
-		searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
-	    }
-	    else {
-		appendstrlistpat(patlist, str);
-	    }
+	    if (str) appendstrlistpat(patlist, str);
+	    else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
 	}
 	else goto badcri;
 	break;
@@ -7909,13 +7884,8 @@ int parsecharset;
 	    c = getastring(imapd_in, imapd_out, &arg);
 	    if (c == EOF) goto missingarg;
 	    str = charset_convert(arg.s, *charset, NULL, 0);
-	    if (strchr(str, EMPTY)) {
-		/* Force failure */
-		searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
-	    }
-	    else {
-		appendstrlistpat(&searchargs->subject, str);
-	    }
+	    if (str) appendstrlistpat(&searchargs->subject, str);
+	    else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
 	}
 	else goto badcri;
 	break;
@@ -7926,26 +7896,16 @@ int parsecharset;
 	    c = getastring(imapd_in, imapd_out, &arg);
 	    if (c == EOF) goto missingarg;
 	    str = charset_convert(arg.s, *charset, NULL, 0);
-	    if (strchr(str, EMPTY)) {
-		/* Force failure */
-		searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
-	    }
-	    else {
-		appendstrlistpat(&searchargs->to, str);
-	    }
+	    if (str) appendstrlistpat(&searchargs->to, str);
+	    else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
 	}
 	else if (!strcmp(criteria.s, "text")) {
 	    if (c != ' ') goto missingarg;		
 	    c = getastring(imapd_in, imapd_out, &arg);
 	    if (c == EOF) goto missingarg;
 	    str = charset_convert(arg.s, *charset, NULL, 0);
-	    if (strchr(str, EMPTY)) {
-		/* Force failure */
-		searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
-	    }
-	    else {
-		appendstrlistpat(&searchargs->text, str);
-	    }
+	    if (str) appendstrlistpat(&searchargs->text, str);
+	    else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET);
 	}
 	else goto badcri;
 	break;
diff --git a/lib/Makefile.in b/lib/Makefile.in
index 07f8e59..d5adb77 100644
--- a/lib/Makefile.in
+++ b/lib/Makefile.in
@@ -141,24 +141,19 @@ imapopts.c: imapoptions $(srcdir)/../tools/config2header
 
 imapopts.h: imapopts.c
 
-chartable.c: mkchartable 
+chartable.c: mkchartable.pl
 	@echo "### Building chartables..."
 	rm -f chartable.c
-	./mkchartable \
+	perl $(srcdir)/mkchartable.pl \
 	 -m $(srcdir)/charset/unifix.txt	\
 	 -m $(srcdir)/charset/unidata2.txt \
 	 $(srcdir)/charset/*.t \
 	 > chartable.c \
 	 || (rm -f chartable.c && exit 1)
 	@echo "### Done building chartables."
-#	./mkchartable -m $(srcdir)/charset/unicode.map $(srcdir)/charset/*.t >x-chartables.h
-#	mv x-chartables.h chartables.h
-
-mkchartable: mkchartable.o xstrlcpy.o xstrlcat.o xmalloc.o assert.o
-	$(CC) $(LDFLAGS) -o mkchartable mkchartable.o xstrlcpy.o xstrlcat.o xmalloc.o assert.o
 
 clean:
-	rm -f *.o *.a chartable.c Makefile.bak mkchartable makedepend.log \
+	rm -f *.o *.a chartable.c Makefile.bak makedepend.log \
 	$(BUILTSOURCES)
 
 distclean: clean
diff --git a/lib/charset.c b/lib/charset.c
index a83a625..65fda3d 100644
--- a/lib/charset.c
+++ b/lib/charset.c
@@ -53,115 +53,72 @@
 #include "chartable.h"
 #include "util.h"
 
-extern const unsigned char chartables_long_translations[];
-extern const int charset_max_translation;
-extern const unsigned char chartables_unicode_block[256];
-extern const unsigned char chartables_unicode[][256][4];
-extern const unsigned char chartables_us_ascii[][256][4];
+/* unicode canon translations */
+extern const int chartables_translation_multichar[];
+extern const unsigned char chartables_translation_block16[256];
+extern const unsigned char chartables_translation_block8[][256];
+extern const int chartables_translation[][256];
+
+/* named character sets */
 extern const struct charset chartables_charset_table[];
 extern const int chartables_num_charsets;
 
-struct decode_state {
-    const unsigned char (*curtable)[256][4];
-    const unsigned char (*lasttable)[256][4];
-    const unsigned char (*initialtable)[256][4];
-    unsigned utfcode;
-    unsigned num_bits;
-    unsigned b64_value;
+struct qp_state {
+    int isheader;
+    int bytesleft;
+    int codepoint;
+};
+
+struct b64_state {
+    int bytesleft;
+    int codepoint;
+};
+
+struct table_state {
+    const struct charmap (*curtable)[256];
+    const struct charmap (*initialtable)[256];
+    int bytesleft;
+    int codepoint;
+    int mode;
+    int num_bits;
+};
+
+struct canon_state {
+    int spacemode;
+    int seenspace;
 };
-#define START(state,table) \
-    ((state).curtable = (state.initialtable) = (table)); \
-    ((state).lasttable = NULL); \
-    ((state).utfcode = 0); \
-    ((state).num_bits = 0); \
-    ((state).b64_value = 0);
-
-
-static int xlate(int index, char *to);
-static int writeutf8(unsigned utfcode, char *to);
-
-#define TRANSLATE(state,c,ptr,idx) \
-{ \
-    unsigned char _ch; \
-    const unsigned char *_translation = (state).curtable[0][(unsigned char)(c) & 0xff]; \
-    for (;;) { \
-	switch (_ch = *_translation++) { \
-	case JSR: \
-	    (state).lasttable = (state).curtable; \
-	    /* FALL THROUGH */ \
-	case JMP: \
-	    (state).curtable = ((state).initialtable + \
-	      (_translation[0]<<8) + (_translation[1])); \
-	    break; \
- \
-	case RET: \
-	    (state).curtable = (state).lasttable; \
-	    /* FALL THROUGH */ \
-	case END: \
-	    break; \
-\
-	case U7F: \
-	    (state).b64_value = 0; \
-	    (state).num_bits = 0; \
-	    (state).curtable = ((state).initialtable + 1); \
-	    /* FALL THROUGH */ \
-	case U7N: \
-	    (state).b64_value <<= 6; \
-	    (state).b64_value += index_64[(unsigned char)(c) & 0xff]; \
-	    (state).num_bits += 6; \
-	    if ((state).num_bits >= 16) { \
-		(state).num_bits -= 16; \
-		(state).utfcode = \
-		    ((state).b64_value >> (state).num_bits) & 0x7fff; \
-		idx += writeutf8((state).utfcode, ptr+idx); \
-	    } \
-	    break; \
-\
-	case U83: \
-	    (state).lasttable = (state).curtable; \
-	    (state).utfcode = (c & 0x0f) << 12; \
-	    (state).curtable = ((state).initialtable + 1); \
-	    break; \
-\
-	case U83_2: \
-	    (state).utfcode += (c & 0x3f) << 6; \
-	    (state).curtable = ((state).initialtable + 2); \
-	    break; \
-\
-	case U83_3: \
-	    (state).utfcode += (c & 0x03f); \
-	    (state).curtable = (state).initialtable; \
-	    idx += writeutf8((state).utfcode, ptr+idx); \
-	    break; \
- \
-	case XLT: \
-	    idx += xlate((_translation[0]<<8) + (_translation[1]), ptr+idx); \
-	    _translation += 2; /* next translation is a RET or END */ \
-	    continue; \
- \
-	default: \
-	    (ptr)[(idx)++] = _ch; \
-	    continue; \
-	} \
-	break; \
-    } \
-}
 
-/* for a comp_pat, ascii[0x80] == 0 if there are any non-ascii characters
-   in the pattern */
 struct comp_pat_s {
-    int pat[256];		/* boyer-moore skip table */
-    int ascii[256];		/* case-mapped version of table */
-    int patlen;
-    int patlastchar;		/* last character in the pattern */
-    int patotherlastchar;	/* case-flip of the last character */
+    int max_start;
+    size_t patlen;
 };
 
-#define PATASCII(pat) (pat+256)
-#define PATLEN(pat) ((pat)[512])
-#define PATLASTCHAR(pat) ((pat)[513]) /* last character in the pattern */
-#define PATOTHERLASTCHAR(pat) ((pat)[514]) /* case-flip of the pattern */
-#define PATSIZE 515
+struct search_state {
+    size_t *starts;
+    int max_start;
+    int havematch;
+    unsigned char *substr;
+    size_t patlen;
+    size_t offset;
+};
+
+struct buffer_state {
+    unsigned char *base;
+    size_t offset;
+    size_t alloced;
+};
+
+struct convert_rock;
+
+typedef void convertproc_t(struct convert_rock *rock, int c);
+typedef void freeconvert_t(struct convert_rock *rock);
+
+struct convert_rock {
+    convertproc_t *f;
+    freeconvert_t *cleanup;
+    struct convert_rock *next;
+    void *state;
+};
 
 #define GROWSIZE 100
 
@@ -212,7 +169,557 @@ static const char index_64[256] = {
 };
 #define CHAR64(c)  (index_64[(unsigned char)(c)])
 
-#define USASCII(c) (chartables_us_ascii[0][(unsigned char)(c)][0])
+static inline void convert_putc(struct convert_rock *rock, int c)
+{
+    rock->f(rock, c);
+}
+
+void convert_cat(struct convert_rock *rock, const char *s)
+{
+    while (*s) {
+	convert_putc(rock, (unsigned char)*s);
+	s++;
+    }
+}
+
+void convert_catn(struct convert_rock *rock, const char *s, size_t len)
+{
+    while (len-- > 0) {
+	convert_putc(rock, (unsigned char)*s);
+	s++;
+    }
+}
+
+/* convertproc_t conversion functions */
+
+void qp2byte(struct convert_rock *rock, int c) 
+{
+    struct qp_state *s = (struct qp_state *)rock->state;
+    int val;
+
+    if (s->bytesleft) {
+	s->bytesleft--;
+	val = HEXCHAR(c);
+	if (val == XX) {
+	    /* mark invalid regardless */
+	    s->codepoint = -1;
+	    return;
+	}
+	if (s->codepoint != -1) {
+	    /* don't blat the invalid marker, but still absorb
+	     * the second char */
+	    s->codepoint = (s->codepoint << 4) + val;
+	}
+	if (!s->bytesleft) {
+	    convert_putc(rock->next, s->codepoint & 0xff);
+	}
+	return;
+    }
+
+    /* start an encoded byte */
+    if (c == '=') {
+	s->bytesleft = 2;
+	s->codepoint = 0;
+	return;
+    }
+
+    /* underscores are space in headers */
+    if (s->isheader && c == '_') c = ' ';
+
+    convert_putc(rock->next, c);
+}
+
+void b64_2byte(struct convert_rock *rock, int c) 
+{
+    struct b64_state *s = (struct b64_state *)rock->state;
+    char b = CHAR64(c);
+
+    /* could just be whitespace, ignore it */
+    if (b == XX) return;
+
+    switch (s->bytesleft) {
+    case 0:
+        s->codepoint = b;
+	s->bytesleft = 3;
+	break;
+    case 3:
+	convert_putc(rock->next, ((s->codepoint << 2) | (b >> 4)) & 0xff);
+	s->codepoint = b;
+	s->bytesleft = 2;
+	break;
+    case 2:
+	convert_putc(rock->next, ((s->codepoint << 4) | (b >> 2)) & 0xff);
+	s->codepoint = b;
+	s->bytesleft = 1;
+	break;
+    case 1:
+	convert_putc(rock->next, ((s->codepoint << 6) | b) & 0xff);
+	s->codepoint = 0;
+	s->bytesleft = 0;
+    }
+}
+
+void table2uni(struct convert_rock *rock, int c)
+{
+    struct table_state *s = (struct table_state *)rock->state;
+    struct charmap *map = (struct charmap *)&s->curtable[0][c & 0xff];
+
+    if (c == -1) { /* invalid character propagation: emit U+FFFD */
+	convert_putc(rock->next, 0xfffd);
+	return;
+    }
+
+    if (map->c) {
+	convert_putc(rock->next, map->c);
+    }
+
+    s->curtable = s->initialtable + map->next;
+}
+
+void utf8_2uni(struct convert_rock *rock, int c)
+{
+    struct table_state *s = (struct table_state *)rock->state;
+
+    if (c == -1) { /* invalid character propagation: emit U+FFFD */
+	convert_putc(rock->next, 0xfffd);
+	return;
+    }
+
+    if ((c & 0xf8) == 0xf0) { /* 11110xxx */
+	/* first of a 4 char sequence */
+	s->bytesleft = 3;
+	s->codepoint = c & 0x07; /* 00000111 */
+    }
+    else if ((c & 0xf0) == 0xe0) { /* 1110xxxx */
+	/* first of a 3 char sequence */
+	s->bytesleft = 2;
+	s->codepoint = c & 0x0f; /* 00001111 */
+    }
+    else if ((c & 0xe0) == 0xc0) { /* 110xxxxx */
+	/* first of a 2 char sequence */
+	s->bytesleft = 1;
+	s->codepoint = c & 0x1f; /* 00011111 */
+    }
+    else if ((c & 0xc0) == 0x80) { /* 10xxxxxx */
+	/* continuation char, handle only if expected */
+	if (s->bytesleft > 0) {
+	    s->codepoint = (s->codepoint << 6) + (c & 0x3f); /* 00111111 */
+	    s->bytesleft--;
+	    if (!s->bytesleft) {
+	        convert_putc(rock->next, s->codepoint);
+		s->codepoint = 0;
+	    }
+	}
+    }
+    else { /* plain ASCII char (bytes 0xf8-0xff also land here) */
+	convert_putc(rock->next, c);
+	s->bytesleft = 0;
+	s->codepoint = 0;
+    }
+}
+
+void utf7_2uni (struct convert_rock *rock, int c)
+{
+    struct table_state *s = (struct table_state *)rock->state;
+
+    if (c == -1) { /* invalid character propagation: emit U+FFFD */
+	convert_putc(rock->next, 0xfffd);
+	return;
+    }
+
+    if (c & 0x80) { /* skip 8-bit chars */
+	convert_putc(rock->next, -1);
+	return;
+    }
+
+    /* Inside a base64 encoded unicode fragment */
+    if (s->mode) {
+	/* '-' marks the end of a fragment */
+	if (c == '-') {
+	    /* special case: sequence +- creates output '+' */
+	    if (s->mode == 1)
+		convert_putc(rock->next, '+');
+	    /* otherwise no output for the '-' */
+	    s->mode = 0;
+	    s->num_bits = 0;
+	    s->codepoint = 0;
+	}
+	/* a normal char drops us out of base64 mode */
+	else if (CHAR64(c) == XX) {
+	    /* pass on the char */
+	    convert_putc(rock->next, c);
+	    /* and switch back to ASCII mode */
+	    s->mode = 0;
+	    /* XXX: warn if num_bits > 4 or codepoint != 0 */
+	    s->num_bits = 0;
+	    s->codepoint = 0;
+	}
+	/* base64 char - process it into the state machine */
+	else {
+	    s->mode = 2; /* we have some content, so don't process special +- */
+	    /* add the 6 bits of value from this character */
+	    s->codepoint = (s->codepoint << 6) + CHAR64(c);
+	    s->num_bits += 6;
+	    /* once a full 16-bit UTF-16 code unit is buffered, send all 16 bits
+	     * down the line and keep the remainder for the next character */
+	    if (s->num_bits >= 16) {
+		s->num_bits -= 16;
+		convert_putc(rock->next, (s->codepoint >> s->num_bits) & 0xffff);
+		s->codepoint &= ((1 << s->num_bits) - 1); /* avoid overflow by trimming */
+	    }
+	}
+    }
+
+    /* regular ASCII mode */
+    else {
+	/* '+' switches to base64 unicode mode */
+	if (c == '+') {
+	    s->mode = 1; /* switch mode, but no content processed yet */
+	    s->codepoint = 0;
+	    s->num_bits = 0;
+	}
+	/* regular ASCII char */
+	else {
+	    convert_putc(rock->next, c);
+	}
+    }
+}
+
+void uni2searchform(struct convert_rock *rock, int c)
+{
+    struct canon_state *s = (struct canon_state *)rock->state;
+    int i;
+    int code;
+    unsigned char table16, table8;
+
+    table16 = chartables_translation_block16[(c>>16) & 0xff];
+
+    /* no translations */
+    if (table16 == 255) {
+	convert_putc(rock->next, c);
+	return;
+    }
+
+    table8 = chartables_translation_block8[table16][(c>>8) & 0xff];
+
+    /* no translations */
+    if (table8 == 255) {
+	convert_putc(rock->next, c);
+	return;
+    }
+
+    /* use the xlate table */
+    code = chartables_translation[table8][c & 0xff];
+
+    /* case - zero length output */
+    if (code == 0) {
+	return;
+    }
+
+    /* special case: whitespace */
+    if (code == ' ') {
+	switch (s->spacemode) {
+	case 0:
+	    return;
+
+	case 1:
+	    if (s->seenspace)
+		return;
+	    s->seenspace = 1;
+	    break;
+	/* XXX - anything other than compress or strip? */
+	}
+    }
+    else
+	s->seenspace = 0;
+
+    /* case - one character output */
+    if (code > 0) {
+	convert_putc(rock->next, code);
+	return;
+    }
+
+    /* case - multiple characters */
+    for (i = -code; chartables_translation_multichar[i]; i++) {
+	/* note: whitespace already stripped from multichar sequences... */
+	convert_putc(rock->next, chartables_translation_multichar[i]);
+    }
+}
+
+void uni2utf8(struct convert_rock *rock, int c)
+{
+    if (c > 0xffff) {
+	convert_putc(rock->next, 0xF0 + ((c >> 18) & 0x07));
+	convert_putc(rock->next, 0x80 + ((c >> 12) & 0x3f));
+	convert_putc(rock->next, 0x80 + ((c >>  6) & 0x3f));
+	convert_putc(rock->next, 0x80 + ( c        & 0x3f));
+    }
+    else if (c > 0x7ff) {
+	convert_putc(rock->next, 0xE0 + ((c >> 12) & 0x0f));
+	convert_putc(rock->next, 0x80 + ((c >>  6) & 0x3f));
+	convert_putc(rock->next, 0x80 + ( c        & 0x3f));
+    }
+    else if (c > 0x7f) {
+	convert_putc(rock->next, 0xC0 + ((c >>  6) & 0x1f));
+	convert_putc(rock->next, 0x80 + ( c        & 0x3f));
+    }
+    else {
+	convert_putc(rock->next, c);
+    }
+}
+
+void byte2search(struct convert_rock *rock, int c)
+{
+    struct search_state *s = (struct search_state *)rock->state;
+    int i, cur;
+    unsigned char b = (unsigned char)c;
+
+    /* check our "in_progress" matches to see if they're still valid */
+    for (i = 0, cur = 0; i < s->max_start; i++) {
+	/* -1 marks the end of the active offsets */
+	if (s->starts[i] == -1) 
+	    break;
+
+	/* if we've passed one that's not ongoing, copy back */
+	if (cur < i) {
+	    s->starts[cur] = s->starts[i];
+	}
+	/* check that the substring is still matching */
+	if (b == s->substr[s->offset - s->starts[i]]) {
+	    if (s->offset - s->starts[i] == s->patlen - 1) {
+		/* last pattern byte matched: we're there! */
+		s->havematch = 1;
+	    }
+	    else {
+		/* keep this one, it's ongoing */
+	    	cur++;
+	    }
+	}
+    }
+    /* starting a new one! */
+    if (b == s->substr[0]) {
+	/* a one-byte pattern matches immediately, nothing to track */
+	if (s->patlen == 1)
+	    s->havematch = 1;
+	else 
+	    s->starts[cur++] = s->offset;
+    }
+    /* empty out any others that aren't being kept */
+    while (cur < i) s->starts[cur++] = -1;
+
+    /* increment the offset counter */
+    s->offset++;
+}
+
+void byte2buffer(struct convert_rock *rock, int c)
+{
+    struct buffer_state *buf = (struct buffer_state *)rock->state;
+
+    /* grow by GROWSIZE when full; xrealloc, never an unchecked realloc */
+    if (buf->offset >= buf->alloced) {
+	buf->alloced += GROWSIZE;
+	buf->base = xrealloc(buf->base, buf->alloced);
+    }
+
+    buf->base[buf->offset++] = c & 0xff;
+}
+
+/* convert_rock manipulation routines */
+
+void table_switch(struct convert_rock *rock, int charset_num)
+{
+    struct table_state *state = (struct table_state *)rock->state;
+
+    /* wipe any current state */
+    memset(state, 0, sizeof(struct table_state)); 
+
+    /* it's a table based lookup */
+    if (chartables_charset_table[charset_num].table) {
+	/* set up the initial table */
+	state->curtable = state->initialtable
+	    = chartables_charset_table[charset_num].table;
+	rock->f = table2uni;
+    }
+
+    /* special case UTF-8 */
+    else if (strstr(chartables_charset_table[charset_num].name, "utf-8")) {
+	rock->f = utf8_2uni;
+    }
+
+    /* special case UTF-7 */
+    else if (strstr(chartables_charset_table[charset_num].name, "utf-7")) {
+	rock->f = utf7_2uni;
+    }
+
+    /* should never happen */
+    else {
+	exit(1);
+	/* do something fatal here! */
+    }
+}
+
+/* Extract a cstring from a buffer.  NOTE: caller must free the memory
+ * themselves once this is called.  Resets the state.  If you don't
+ * call this function then buffer_free will clean up */
+char *buffer_cstring(struct convert_rock *rock)
+{
+    struct buffer_state *buf = (struct buffer_state *)rock->state;
+    char *res;
+
+    /* finish the string: make room for the NUL if the buffer is full */
+    if (buf->offset >= buf->alloced) {
+	buf->alloced++;
+	buf->base = xrealloc(buf->base, buf->alloced);
+    }
+    buf->base[buf->offset] = '\0';
+
+    /* copy the pointer out (base is unsigned char *, hence the cast) */
+    res = (char *)buf->base;
+
+    /* clean up the buffer so it frees correctly later */
+    buf->base = 0;
+    buf->alloced = 0;
+    buf->offset = 0;
+
+    return res;
+}
+
+static inline int search_havematch(struct convert_rock *rock)
+{
+    struct search_state *s = (struct search_state *)rock->state;
+    return s->havematch;
+}
+
+/* conversion cleanup routines */
+
+void basic_free(struct convert_rock *rock) 
+{
+    if (rock) {
+	if (rock->state) free(rock->state);
+	free(rock);
+    }
+}
+
+void search_free(struct convert_rock *rock)
+{
+    if (rock && rock->state) {
+	struct search_state *s = (struct search_state *)rock->state;
+	if (s->starts) free(s->starts);
+    }
+    basic_free(rock);
+}
+
+void buffer_free(struct convert_rock *rock) {
+    if (rock && rock->state) {
+	struct buffer_state *buf = (struct buffer_state *)rock->state;
+	if (buf->base) free(buf->base);
+    }
+    basic_free(rock);
+}
+
+void convert_free(struct convert_rock *rock) {
+    struct convert_rock *next;
+    while (rock) {
+	next = rock->next;
+	if (rock->cleanup)
+	    rock->cleanup(rock);
+	else 
+	    basic_free(rock);
+	rock = next;
+    }
+}
+
+/* converter initialisation routines */
+
+struct convert_rock *qp_init(int isheader, struct convert_rock *next) 
+{
+    struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock));
+    struct qp_state *s = xzmalloc(sizeof(struct qp_state));
+    s->isheader = isheader;
+    rock->state = (void *)s;
+    rock->f = qp2byte;
+    rock->next = next;
+    return rock;
+}
+
+struct convert_rock *b64_init(struct convert_rock *next) 
+{
+    struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock));
+    rock->state = xzmalloc(sizeof(struct b64_state));
+    rock->f = b64_2byte;
+    rock->next = next;
+    return rock;
+}
+
+struct convert_rock *canon_init(int spacemode, struct convert_rock *next)
+{
+    struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock));
+    struct canon_state *s = xzmalloc(sizeof(struct canon_state));
+    s->spacemode = spacemode;
+    rock->f = uni2searchform;
+    rock->state = s;
+    rock->next = next;
+    return rock;
+}
+
+struct convert_rock *uni_init(struct convert_rock *next) 
+{
+    struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock));
+    rock->f = uni2utf8;
+    rock->next = next;
+    return rock;
+}
+
+struct convert_rock *table_init(int charset_num, struct convert_rock *next)
+{
+    struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock));
+    rock->state = xzmalloc(sizeof(struct table_state));
+    rock->next = next;
+    table_switch(rock, charset_num);
+    return rock;
+}
+
+struct convert_rock *search_init(const char *substr, comp_pat *pat) {
+    struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock));
+    struct search_state *s = xzmalloc(sizeof(struct search_state));
+    struct comp_pat_s *p = (struct comp_pat_s *)pat;
+    int i;
+
+    /* copy in tracking vars; substr is borrowed, not copied */
+    s->max_start = p->max_start;
+    s->patlen = p->patlen;
+    s->substr = (unsigned char *)substr;
+
+    /* starts[] holds size_t offsets: size by element type, -1 = unused */
+    s->starts = xmalloc(s->max_start * sizeof(*s->starts));
+    for (i = 0; i < s->max_start; i++) {
+	s->starts[i] = -1;
+    }
+
+    /* set up the rock */
+    rock->f = byte2search;
+    rock->cleanup = search_free;
+    rock->state = (void *)s;
+
+    return rock;
+}
+
+struct convert_rock *buffer_init(char *str, int len)
+{
+    struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock));
+    struct buffer_state *buf = xzmalloc(sizeof(struct buffer_state));
+
+    buf->base = str;
+    buf->alloced = len;
+
+    rock->f = byte2buffer;
+    rock->cleanup = buffer_free;
+    rock->state = (void *)buf;
+
+    return rock;
+}
+
+/* API */
 
 /*
  * Lookup the character set 'name'.  Returns the character set number
@@ -222,9 +729,11 @@ int charset_lookupname(const char *name)
 {
     int i;
 
-    for (i=0; i<chartables_num_charsets; i++) {
-	if (!strcasecmp(name, chartables_charset_table[i].name)) return i;
+    for (i = 0; i < chartables_num_charsets; i++) {
+	if (!strcasecmp(name, chartables_charset_table[i].name)) 
+	    return i;
     }
+
     return -1;
 }
 
@@ -233,55 +742,98 @@ int charset_lookupname(const char *name)
  * into canonical searching form.  Decodes into 'retval', which 
  * must be reallocable and currently at least size 'alloced'.
  */
-char *charset_convert(const char *s, int charset, char *retval,
-    int alloced)
+char *charset_convert(const char *s, int charset, char *buf, size_t bufsz)
 {
-    int pos = 0;
-    struct decode_state state;
+    struct convert_rock *input, *tobuffer;
+    char *res;
 
     if (!s) return 0;
 
-    if (charset < 0 || charset >= chartables_num_charsets) return xstrdup(EMPTY_STRING);
+    if (charset < 0 || charset >= chartables_num_charsets) 
+	return 0;
 
-    START(state,chartables_charset_table[charset].table);
-    
-    if (!alloced) {
-	alloced = GROWSIZE;
-	retval = xmalloc(alloced);
-    }
-    *retval = '\0';
+    /* set up the conversion path */
+    tobuffer = buffer_init(buf, bufsz);
+    input = uni_init(tobuffer);
+    input = canon_init(1, input);
+    input = table_init(charset, input);
 
-    while (*s) {
-	if (pos + charset_max_translation >= alloced) {
-	    alloced += GROWSIZE;
-	    retval = xrealloc(retval, alloced);
-	}
-	TRANSLATE(state, *s, retval, pos);
-	s++;
+    /* do the conversion */
+    convert_cat(input, s);
+
+    /* extract the result */
+    res = buffer_cstring(tobuffer);
+
+    /* clean up */
+    convert_free(input);
+
+    return res;
+}
+
+/* Convert from a given charset and encoding into utf8 */
+char *charset_to_utf8(const char *msg_base, size_t len, int charset, int encoding)
+{
+    struct convert_rock *input, *tobuffer;
+    char *res;
+
+    /* Initialize character set mapping */
+    if (charset < 0 || charset >= chartables_num_charsets) 
+	return 0;
+
+    /* check for trivial search */
+    if (len == 0)
+	return xstrdup("");
+
+    /* set up the conversion path */
+    tobuffer = buffer_init(0, 0);
+    input = uni_init(tobuffer);
+    input = table_init(charset, input);
+
+    /* choose encoding extraction if needed */
+    switch (encoding) {
+    case ENCODING_NONE:
+	break;
+
+    case ENCODING_QP:
+	input = qp_init(0, input);
+	break;
+
+    case ENCODING_BASE64:
+	input = b64_init(input);
+	/* XXX have to have nl-mapping base64 in order to
+	 * properly count \n as 2 raw characters
+	 */
+	break;
+
+    default:
+	/* Don't know encoding--nothing can match */
+	convert_free(input);
+	return 0;
     }
 
-    retval[pos] = '\0';
-    return retval;
+    convert_catn(input, msg_base, len);
+    res = buffer_cstring(tobuffer);
+    convert_free(input);
+
+    return res;
 }
 
-/*
- * Decode MIME strings (per RFC 2047) in 's'.  It writes the decoded
- * string to 'retval', calling realloc() as needed. (Thus retval may
- * be NULL.) Returns retval, contining 's' in canonical searching form.
- */
-char *charset_decode_mimeheader(const char *s, char *retval, int alloced)
+void mimeheader_cat(struct convert_rock *target, const char *s)
 {
+    struct convert_rock *input;
     int eatspace = 0;
     const char *start, *endcharset, *encoding, *end;
     const char *p;
-    int i, c, c1, c2, c3, c4;
-    struct decode_state state;
+    int i, c, c1, c2, c3, c4, charset;
     int pos = 0;
     int len;
+    char *res;
 
     if (!s) return 0;
 
-    START(state,chartables_charset_table[0].table);    /* just for msvc lint */
+    /* set up the conversion path */
+    input = table_init(0, target);
+
     start = s;
     while ((start = (const char*) strchr(start, '=')) != 0) {
 	start++;
@@ -308,88 +860,40 @@ char *charset_decode_mimeheader(const char *s, char *retval, int alloced)
 	}
 	if (!eatspace) {
 	    len = start - s - 1;
-	    if (pos + len >= alloced) {
-		alloced += len + GROWSIZE;
-		retval = xrealloc(retval, alloced);
-	    }
-	    while (len--) {
-		c = USASCII(*s);
-		if (c != END) retval[pos++] = (char)c;
-		s++;
-	    }
+	    table_switch(input, 0); /* US_ASCII */
+	    convert_catn(input, s, len);
 	}
 
 	/*
 	 * Get the 1522-word's character set
 	 */
 	start++;
-	for (i=0; i<chartables_num_charsets; i++) {
-	    if ((int)strlen(chartables_charset_table[i].name) == endcharset-start &&
-		!strncasecmp(start, chartables_charset_table[i].name, endcharset-start)) {
-		START(state,chartables_charset_table[i].table);
+	for (charset = 0; charset < chartables_num_charsets; charset++) {
+	    if ((int)strlen(chartables_charset_table[charset].name) == endcharset-start &&
+		!strncasecmp(start, chartables_charset_table[charset].name, endcharset-start)) {
+		table_switch(input, charset);
 		break;
 	    }
 	}
 
-	if (i == chartables_num_charsets) {
+	if (charset == chartables_num_charsets) {
 	    /* Unrecognized charset, nothing will match here */
-	    if (pos + 2 >= alloced) {
-		alloced += 2 + GROWSIZE;
-		retval = xrealloc(retval, alloced);
-	    }
-	    strcpy(retval+pos, EMPTY_STRING);
-	    pos += 1;
-	}
-	else if (encoding[1] == 'q' || encoding[1] == 'Q') {
-	    /* Decode 'Q' encoding */
-	    p = encoding+3;
-	    while (p < end) {
-		c = *p++;
-		if (c == '=') {
-		    c = HEXCHAR(*p);
-		    p++;
-		    i = HEXCHAR(*p);
-		    p++;
-		    if (c == XX || i == XX) {
-			c = '\0';
-		    }
-		    else {
-			c = (char)((c << 4) + i);
-		    }
-		}
-		else if (c == '_') c = ' ';
-
-		if (pos + charset_max_translation >= alloced) {
-		    alloced += GROWSIZE;
-		    retval = xrealloc(retval, alloced);
-		}
-		TRANSLATE(state, c, retval, pos);
-	    }
+	    convert_putc(input, -1); /* unknown character */
 	}
 	else {
-	    /* Decode 'B' encoding */
-	    p = encoding+3;
-	    while (p < end) {
-		if (pos + charset_max_translation*3 >= alloced) {
-		    alloced += GROWSIZE;
-		    retval = xrealloc(retval, alloced);
-		}
-		c1 = CHAR64(p[0]);
-		if (c1 == XX) break;
-		c2 = CHAR64(p[1]);
-		if (c2 == XX) break;
-		TRANSLATE(state,((c1<<2) | ((c2&0x30)>>4)), retval, pos);
-
-		c3 = CHAR64(p[2]);
-		if (c3 == XX) break;
-		TRANSLATE(state,(((c2&0XF)<<4) | ((c3&0x3C)>>2)), retval, pos);
-
-		c4 = CHAR64(p[3]);
-		if (c4 == XX) break;
-		TRANSLATE(state,(((c3&0x03) <<6) | c4), retval, pos);
-
-		p += 4;
+	    struct convert_rock *extract;
+	    /* choose decoder */
+	    if (encoding[1] == 'q' || encoding[1] == 'Q') {
+		extract = qp_init(1, input);
 	    }
+	    else {
+		extract = b64_init(input);
+	    }
+	    /* convert */
+	    p = encoding+3;
+	    convert_catn(extract, p, end - p);
+	    /* clean up */
+	    basic_free(extract);
 	}
 
 	/* Prepare for the next iteration */
@@ -398,164 +902,136 @@ char *charset_decode_mimeheader(const char *s, char *retval, int alloced)
     }
 
     /* Copy over the tail part of the input string */
-    len = strlen(s);
-    if (pos + len >= alloced) {
-	alloced += len + 1;
-	retval = xrealloc(retval, alloced);
-    }
-    while (len--) {
-	c = USASCII(*s);
-	if (c != END) retval[pos++] = (char)c;
-	s++;
+    if (*s) {
+	table_switch(input, 0); /* US_ASCII */
+	convert_cat(input, s);
     }
-    retval[pos] = '\0';
-    return retval;
-}
 
-/*
- * Compile the pattern 's' and return a pointer to the compiled form
- */
-comp_pat *charset_compilepat(const char *s)
-{
-    comp_pat *pat;
-    int i, c, len;
-
-    pat = (comp_pat *)xmalloc(PATSIZE * sizeof(comp_pat));
-    PATLEN(pat) = len = strlen(s);
-    if (len) {
-	PATLASTCHAR(pat) = c = (unsigned char)s[len-1];
-	if (isupper(c)) PATOTHERLASTCHAR(pat) = TOLOWER(c);
-	else if (islower(c)) PATOTHERLASTCHAR(pat) = TOUPPER(c);
-	else PATOTHERLASTCHAR(pat) = c;
-    }
-    for (i=0; i<512; i++) pat[i] = len;
-    for (i=0; i<len; i++) {
-	c = (unsigned char)s[i];
-	PATASCII(pat)[c] = pat[c] = len-i-1;
-	if (c & 0x80) PATASCII(pat)[0x80] = 0;
-    }
-    for (i='A'; i<='Z'; i++) {
-	PATASCII(pat)[i] = PATASCII(pat)[i-'A'+'a'];
-    }
-    return pat;
+    /* just free this one, the rest can be cleaned up by the sender */
+    basic_free(input);
 }
 
 /*
- * Free the compiled pattern 'pat'
+ * Decode MIME strings (per RFC 2047) in 's'.  It writes the decoded
+ * string to 'retval', calling realloc() as needed. (Thus retval may
+ * be NULL.) Returns retval, containing 's' in canonical searching form.
  */
-void charset_freepat(comp_pat *pat)
+char *charset_decode_mimeheader(const char *s, char *retval, size_t alloced)
 {
-    free((char *)pat);
+    struct convert_rock *tobuffer, *input;
+    char *res;
+
+    tobuffer = buffer_init(retval, alloced);
+    input = uni_init(tobuffer);
+    input = canon_init(1, input);
+
+    mimeheader_cat(input, s);
+ 
+    res = buffer_cstring(tobuffer);
+
+    convert_free(input);
+
+    return res;
 }
 
 /*
- * Search for the string 'substr', with compiled pattern 'pat'
- * in the string 's', with length 'len'.  Return nonzero if match
+ * Decode MIME strings (per RFC 2047) in 's' into a fresh buffer
+ * (buffer_init(0, 0)) and return it as a C string.  Unlike
+ * charset_decode_mimeheader(), no canon_init() step is applied.
  */
-int charset_searchstring(const char *substr, comp_pat *pat,
-    const char *s, int len)
+char *charset_parse_mimeheader(const char *s)
 {
-    int i, j, large;
-    
-    assert(pat != NULL);
-    i = PATLEN(pat) - 1;
-    if (i < 0) return 1;
-    pat[PATLASTCHAR(pat)] = large = len + i + 2;
-    for (;;) {
-	/* Inner loop -- scan until last char match or end of string */
-	while (i < len) {
-	    i += pat[(unsigned char)s[i]];
-	}
+    struct convert_rock *tobuffer, *input;
+    char *res;
 
-	/* End of string */
-	if (i < large) return 0;
+    tobuffer = buffer_init(0, 0);
+    input = uni_init(tobuffer);
 
-	/* Last char match--back up and do compare */
-	i -= large + 1;
-	j = PATLEN(pat) - 2;
-	while (j >= 0 && s[i] == substr[j]) {
-	    i--;
-	    j--;
-	}
-	if (j < 0) return 1;	/* Found match */
-	if (pat[(unsigned char)s[i]] == large ||
-	    pat[(unsigned char)s[i]] < PATLEN(pat)-j) {
-	    i += PATLEN(pat) - j;
-	}
-	else {
-	    i += pat[(unsigned char)s[i]];
-	}
-    }
-}    
+    mimeheader_cat(input, s);
+ 
+    res = buffer_cstring(tobuffer);
 
-static int xlate(int index, char *to) {
-    const unsigned char *from = chartables_long_translations + index;
-    int i = 0;
+    convert_free(input);
 
-    while ((*to++ = *from++) != END) i++;
-    return i;
+    return res;
 }
 
-static int writeutf8(unsigned utfcode, char *to)
+int charset_search_mimeheader(const char *substr, comp_pat *pat,
+    const char *s, int searchform)
 {
-    int table = chartables_unicode_block[utfcode>>8];
-    int idx = 0;
-    struct decode_state state;
-
-    if (table == 255) {
-	/* No translations in this block */
-	if (utfcode > 0x7ff) {
-	    to[0] = (char)(0xE0 + (utfcode >> 12));
-	    to[1] = (char)(0x80 + ((utfcode >> 6) & 0x3f));
-	    to[2] = (char)(0x80 + (utfcode & 0x3f));
-	    return 3;
-	}
-	if (utfcode > 0x7f) {
-	    to[0] = (char)(0xC0 + (utfcode >> 6));
-	    to[1] = (char)(0x80 + (utfcode & 0x3f));
-	    return 2;
-	}
-	to[0] = (char)utfcode;
-	return 1;
-    }
+    struct convert_rock *input, *tosearch;
+    int res;
 
-    START(state, chartables_unicode + table);
-    TRANSLATE(state, (utfcode & 0xff), to, idx);
+    tosearch = search_init(substr, pat);
+    input = uni_init(tosearch);
+    if (searchform) input = canon_init(1, input);
 
-    return idx;
+    mimeheader_cat(input, s);
+ 
+    res = search_havematch(tosearch);
 
+    convert_free(input);
+
+    return res;
+}
+
+/* Compile a search pattern for later comparison.  We just count
+ * how long the string is (patlen), and how many times the first
+ * character occurs in it (max_start).  Later optimisation could
+ * reduce max_start by deeper analysis of the possible paths through
+ * the string, but this is a good absolute maximum, and it just
+ * means a few more bytes get allocated... */
+comp_pat *charset_compilepat(const char *s)
+{
+    struct comp_pat_s *pat = xzmalloc(sizeof(struct comp_pat_s));
+    const char *p = s;
+    /* count occurrences */
+    while (*p) {
+	if (*p == *s) pat->max_start++; 
+	pat->patlen++;
+	p++;
+    }
+    return (comp_pat *)pat;
 }
 
 /*
- * The various charset_searchfile() helper functions
+ * Free the compiled pattern 'pat'
  */
-struct input_state;
-typedef int rawproc_t(struct input_state *state, char *buf, int size);
-
-static int charset_readconvert(struct input_state *state, char *buf, int size);
-static rawproc_t charset_readplain;
-static rawproc_t charset_readplain_nospc;
-static rawproc_t charset_readmapnl;
-static rawproc_t charset_readqp;
-static rawproc_t charset_readqp_nospc;
-static rawproc_t charset_readqpmapnl;
-static rawproc_t charset_readbase64;
-static rawproc_t charset_readbase64_nospc;
+void charset_freepat(comp_pat *pat)
+{
+    free((struct comp_pat_s *)pat);
+}
 
 /*
- * State for the various charset_searchfile() helper functions
+ * Search for the string 'substr', with compiled pattern 'pat'
+ * in the string 's', with length 'len'.  Return nonzero if match
+ *
+ * Uses the to_search target directly.  Assumes 's' is already
+ * in search normal form (i.e. from a cache file)
  */
-struct input_state {
-    rawproc_t *rawproc;	/* Function to read and transfer-decode data */
-    const char *rawbase;	/* Location in mapped file of raw data */
-    int rawlen;		/* # bytes raw data left to read from file */
-    char decodebuf[2048];	/* Buffer of data deocded, but not converted
-				 * into canonical searching form */
-    int decodestart, decodeleft; /* Location/count of decoded data */
-    struct decode_state decodestate; /* Charset state to convert decoded data
-				  * into canonical searching form */
-};
+int charset_searchstring(const char *substr, comp_pat *pat,
+    const char *s, size_t len)
+{
+    struct convert_rock *tosearch;
+    int res;
+
+    /* set up the search handler */
+    tosearch = search_init(substr, pat);
+
+    /* feed the handler one byte at a time, as a non-negative value */
+    while (len-- > 0) {
+	convert_putc(tosearch, (unsigned char)*s++);
+	if (search_havematch(tosearch)) break; /* shortcut if there's a match */
+    }
 
+    /* copy the value before the handler is freed */
+    res = search_havematch(tosearch);
+
+    /* clean up */
+    search_free(tosearch);
+
+    return res;
+}
 
 /*
  * Search for the string 'substr' in the next 'len' bytes of 
@@ -566,139 +1042,37 @@ struct input_state {
  * Returns nonzero iff the string was found.
  */
 int charset_searchfile(const char *substr, comp_pat *pat,
-    const char *msg_base, int mapnl, int len, int charset, int encoding)
+    const char *msg_base, int mapnl, size_t len, int charset, 
+    int encoding)
 {
-    int substrlen = PATLEN(pat);
-    char *buf, smallbuf[2048];
-    int bufsize;
-    int n;
-    int i, j, large;
-    struct input_state state;
-    
+    struct convert_rock *input, *tosearch;
+    size_t i; /* same type as 'len', which bounds it */
+    int res;
     /* Initialize character set mapping */
-    if (charset < 0 || charset >= chartables_num_charsets) return 0;
-    START(state.decodestate, chartables_charset_table[charset].table);
-    state.decodeleft = 0;
+    if (charset < 0 || charset >= chartables_num_charsets) 
+	return 0;
 
     /* check for trivial search */
-    if (substrlen == 0) return 1;
-
-    /*
-     * Select buffer to hold canonical searching fomat data to
-     * search
-     */
-    if (substrlen < (int) sizeof(smallbuf)/2) {
-	bufsize = sizeof(smallbuf);
-	buf = smallbuf;
-    }
-    else {
-	bufsize = substrlen+sizeof(smallbuf);
-	buf = xmalloc(bufsize);
-    }
-
-    /* Optimized searching of us-ascii, using boyer-moore */
-    if (charset == 0) {
-	/* Initialize transfer-decoding */
-	state.rawbase = msg_base;
-	state.rawlen = len;
-	/* don't need to special case mapnl since all such chars will
-	   be ignored, anyway */
-	switch (encoding) {
-	case ENCODING_NONE:
-	    state.rawproc = charset_readplain_nospc;
-	    break;
-	    
-	case ENCODING_QP:
-	    state.rawproc = charset_readqp_nospc;
-	    break;
-	    
-	case ENCODING_BASE64:
-	    state.rawproc = charset_readbase64_nospc;
-	    /* XXX have to have nl-mapping base64 in order to
-	     * properly count \n as 2 raw characters
-	     */
-	    break;
-	    
-	default:
-	    /* Don't know encoding--nothing can match */
-	    return 0;
-	}
-	
-	if (PATASCII(pat)[0x80] == 0) {
-	    /* 8-bit chars in pattern--search must fail */
-	    if (buf != smallbuf) free(buf);
-	    return 0;
-	}
-
-	n = (*state.rawproc)(&state, buf, bufsize);
-	if (n < substrlen) {
-	    if (buf != smallbuf) free(buf);
-	    return 0;
-	}
-	i = substrlen - 1;
-	PATASCII(pat)[PATLASTCHAR(pat)] =
-	  PATASCII(pat)[PATOTHERLASTCHAR(pat)] = large = bufsize + i + 2;
-
-	for (;;) {
-	    /* Inner loop -- scan until last char match or end of buffer */
-	    while (i < n) {
-		i += PATASCII(pat)[(unsigned char)buf[i]];
-	    }
-
-	    /* End of buffer */
-	    if (i < large) {
-		/* Read in more stuff */
-		j = i-n;
-		strncpy(buf, buf+i-(substrlen-1), substrlen-1-j);
-		n = (*state.rawproc)(&state, buf+substrlen-1-j, bufsize-substrlen+1+j);
-		i = substrlen-1;
-		if (n > 0) {
-		    n += i-j;
-		    continue;
-		}
-		if (buf != smallbuf) free(buf);
-		return 0;
-	    }
-
-	    /* Last char match--back up and do compare */
-	    i -= large + 1;
-	    j = PATLEN(pat) - 2;
-	    while (j >= 0 && TOLOWER(buf[i]) == TOLOWER(substr[j])) {
-		i--;
-		j--;
-	    }
-	    if (j < 0) {
-		/* Found match */
-		if (buf != smallbuf) free(buf);
-		return 1;
-	    }
-	    if (PATASCII(pat)[(unsigned char)buf[i]] == large ||
-		PATASCII(pat)[(unsigned char)buf[i]] < PATLEN(pat)-j) {
-		i += PATLEN(pat) - j;
-	    }
-	    else {
-		i += PATASCII(pat)[(unsigned char)buf[i]];
-	    }
-	}
-	/* NOTREACHED */
-    }
+    if (strlen(substr) == 0)
+	return 1;
 
-    /* Do the (generalized) search */
+    /* set up the conversion path */
+    tosearch = search_init(substr, pat);
+    input = uni_init(tosearch);
+    input = canon_init(1, input);
+    input = table_init(charset, input);
 
-    /* Initialize transfer-decoding */
-    state.rawbase = msg_base;
-    state.rawlen = len;
+    /* choose encoding extraction if needed */
     switch (encoding) {
     case ENCODING_NONE:
-	state.rawproc = mapnl ? charset_readmapnl : charset_readplain;
 	break;
 
     case ENCODING_QP:
-	state.rawproc = mapnl ? charset_readqpmapnl : charset_readqp;
+	input = qp_init(0, input);
 	break;
 
     case ENCODING_BASE64:
-	state.rawproc = charset_readbase64;
+	input = b64_init(input);
 	/* XXX have to have nl-mapping base64 in order to
 	 * properly count \n as 2 raw characters
 	 */
@@ -706,87 +1080,56 @@ int charset_searchfile(const char *substr, comp_pat *pat,
 
     default:
 	/* Don't know encoding--nothing can match */
+	convert_free(input);
 	return 0;
     }
 
-    n = charset_readconvert(&state, buf, bufsize);
-    if (n < substrlen) {
-	if (buf != smallbuf) free(buf);
-	return 0;
-    }
-    i = substrlen - 1;
-    pat[PATLASTCHAR(pat)] = large = bufsize + i + 2;
-    for (;;) {
-	/* Inner loop -- scan until last char match or end of buffer */
-	while (i < n) {
-	    i += pat[(unsigned char)buf[i]];
+    /* implement the loop here so we can check on the search each time */
+    for (i = 0; i < len; i++) {
+	if (mapnl && msg_base[i] == '\n') {
+	    convert_putc(input, '\r');
+	    len--; /* presumably 'len' counts each mapped \n as 2 chars */
 	}
+	convert_putc(input, (unsigned char)msg_base[i]); /* keep bytes >= 0x80 non-negative */
+	if (search_havematch(tosearch)) break;
+    }
 
-	/* End of buffer */
-	if (i < large) {
-	    /* Read in more stuff */
-	    j = i-n;
-	    strncpy(buf, buf+i-(substrlen-1), substrlen-1-j);
-	    n = charset_readconvert(&state, buf+substrlen-1-j,
-				    bufsize-substrlen+1+j);
-	    i = substrlen-1;
-	    if (n > 0) {
-		n += i-j;
-		continue;
-	    }
-	    if (buf != smallbuf) free(buf);
-	    return 0;
-	}
+    res = search_havematch(tosearch); /* copy before we free it */
 
-	/* Last char match--back up and do compare */
-	i -= large + 1;
-	j = PATLEN(pat) - 2;
-	while (j >= 0 && buf[i] == substr[j]) {
-	    i--;
-	    j--;
-	}
-	if (j < 0) {
-	    /* Found match */
-	    if (buf != smallbuf) free(buf);
-	    return 1;
-	}
-	if (pat[(unsigned char)buf[i]] == large ||
-	    pat[(unsigned char)buf[i]] < PATLEN(pat)-j) {
-	    i += PATLEN(pat) - j;
-	}
-	else {
-	    i += pat[(unsigned char)buf[i]];
-	}
-    }
+    convert_free(input);
+
+    return res;
 }
 
 /* This is based on charset_searchfile above. */
 int charset_extractfile(index_search_text_receiver_t receiver,
-    void* rock, int uid, const char *msg_base, int mapnl, int len, int charset,
-    int encoding) {
-    char buf[2048];
-    int n;
-    struct input_state state;
-    
+    void* rock, int uid, const char *msg_base, int mapnl, size_t len, 
+    int charset, int encoding)
+{
+    struct convert_rock *input, *tobuffer;
+    struct buffer_state *out;
+    size_t i; /* same type as 'len', which bounds it */
+
     /* Initialize character set mapping */
-    if (charset < 0 || charset >= chartables_num_charsets) return 0;
-    START(state.decodestate, chartables_charset_table[charset].table);
-    state.decodeleft = 0;
+    if (charset < 0 || charset >= chartables_num_charsets)
+	return 0; /* bail out before the conversion chain is built */
+
+    /* set up the conversion path (only once 'charset' is known valid) */
+    tobuffer = buffer_init(0, 0);
+    input = uni_init(tobuffer);
+    input = canon_init(1, input);
+    input = table_init(charset, input);
 
-    /* Initialize transfer-decoding */
-    state.rawbase = msg_base;
-    state.rawlen = len;
     switch (encoding) {
     case ENCODING_NONE:
-	state.rawproc = mapnl ? charset_readmapnl : charset_readplain;
 	break;
 
     case ENCODING_QP:
-	state.rawproc = mapnl ? charset_readqpmapnl : charset_readqp;
+	input = qp_init(0, input);
 	break;
 
     case ENCODING_BASE64:
-	state.rawproc = charset_readbase64;
+	input = b64_init(input);
 	/* XXX have to have nl-mapping base64 in order to
 	 * properly count \n as 2 raw characters
 	 */
@@ -794,50 +1137,33 @@ int charset_extractfile(index_search_text_receiver_t receiver,
 
     default:
 	/* Don't know encoding--nothing can match */
+	convert_free(input);
 	return 0;
     }
 
-    /* We don't need to do anything tricky. Just read and convert each block of the
-       text, then hand the converted text down to the receiver. */
-    do {
-      n = charset_readconvert(&state, buf, sizeof(buf));
-      if (n > 0) {
-        receiver(uid, SEARCHINDEX_PART_BODY,
-                 SEARCHINDEX_CMD_APPENDPART, buf, n, rock);
-      }
-    } while (n > 0);
+    /* point to the buffer for easy block sending */
+    out = (struct buffer_state *)tobuffer->state;
 
-    return 1;
-}
-
-/*
- * Helper function to read at most 'size' bytes of converted
- * (into canonical searching format) data into 'buf'.  Returns
- * the number of converted bytes, or 0 for end-of-data.
- */
-static int charset_readconvert(struct input_state *state, char *buf, int size)
-{
-    int retval = 0;
+    for (i = 0; i < len; i++) {
+	if (mapnl && msg_base[i] == '\n') {
+	    convert_putc(input, '\r');
+	    len--; /* presumably 'len' counts each mapped \n as 2 chars */
+	}
+	convert_putc(input, (unsigned char)msg_base[i]); /* keep bytes >= 0x80 non-negative */
 
-    if (state->decodeleft && state->decodestart != 0) {
-	memmove(state->decodebuf, state->decodebuf+state->decodestart,
-		state->decodeleft);
+	/* process a block of output every so often */
+	if (out->offset > 4096) {
+	    receiver(uid, SEARCHINDEX_PART_BODY, SEARCHINDEX_CMD_APPENDPART, out->base, out->offset, rock);
+	    out->offset = 0;
+	}
+    }
+    if (out->offset) { /* finish it */
+	receiver(uid, SEARCHINDEX_PART_BODY, SEARCHINDEX_CMD_APPENDPART, out->base, out->offset, rock);
     }
-    state->decodestart = 0;
 
-    state->decodeleft += (*state->rawproc)(state,
-					   state->decodebuf+state->decodeleft,
-					   sizeof(state->decodebuf)-state->decodeleft);
+    convert_free(input);
 
-    while (state->decodeleft) {
-	if (retval + charset_max_translation > size) {
-	    return retval;
-	}
-	TRANSLATE(state->decodestate, state->decodebuf[state->decodestart], buf, retval);
-	state->decodestart++;
-	state->decodeleft--;
-    }
-    return retval;
+    return 1;
 }
 
 /*
@@ -847,14 +1173,10 @@ static int charset_readconvert(struct input_state *state, char *buf, int size)
  * least size 'alloced'.  Returns the number of decoded bytes in
  * 'outlen'. 
  */
-char *charset_decode_mimebody(const char *msg_base, int len, int encoding,
-			      char **retval, int alloced, int *outlen)
+char *charset_decode_mimebody(const char *msg_base, size_t len, int encoding,
+			      char **retval, size_t alloced, size_t *outlen)
 {
-    struct input_state state;
-    
-    /* Initialize transfer-decoding */
-    state.rawbase = msg_base;
-    state.rawlen = len;
+    struct convert_rock *input, *tobuffer;
 
     switch (encoding) {
     case ENCODING_NONE:
@@ -862,11 +1184,13 @@ char *charset_decode_mimebody(const char *msg_base, int len, int encoding,
 	return (char *) msg_base;
 
     case ENCODING_QP:
-	state.rawproc = charset_readqp;
+	tobuffer = buffer_init(*retval, alloced);
+	input = qp_init(0, tobuffer);
 	break;
 
     case ENCODING_BASE64:
-	state.rawproc = charset_readbase64;
+	tobuffer = buffer_init(*retval, alloced);
+	input = b64_init(tobuffer);
 	break;
 
     default:
@@ -874,461 +1198,22 @@ char *charset_decode_mimebody(const char *msg_base, int len, int encoding,
 	return NULL;
     }
 
-    if (alloced < len+1) *retval = xrealloc(*retval, len+1);
-    *outlen = (*state.rawproc)(&state, *retval, len);
-    (*retval)[*outlen] = '\0';
-    return *retval;
-}
-
-/*
- * Helper function to read at most 'size' bytes of trivial
- * transfer-decoded data into 'buf'.  Returns the number of decoded
- * bytes, or 0 for end-of-data.
- */
-static int charset_readplain(struct input_state *state, char *buf, int size)
-{
-    if (size > state->rawlen) size = state->rawlen;
-    if (!size) return 0;
-
-    memcpy(buf, state->rawbase, size);
-    state->rawlen -= size;
-    state->rawbase += size;
-
-    return size;
-}
-
-/*
- * Helper function to read at most 'size' bytes of trivial
- * transfer-decoded data into 'buf'.  Removes any US-ASCII whitespace.
- * Returns the number of decoded bytes, or 0 for end-of-data.  
- */
-static int charset_readplain_nospc(struct input_state *state, 
-				   char *buf, int size)
-{
-    int i;
-
-    for (i = 0; i < size; i++) {
-	/* remove any whitespace at the beginning of rawbase */
-	while (state->rawlen > 0 && USASCII(*state->rawbase) == END) {
-	    state->rawlen--;
-	    state->rawbase++;
-	}
-
-	if (state->rawlen == 0) break;
-
-	/* copy a char */
-	buf[i] = *state->rawbase++;
-	state->rawlen--;
-    }
-
-    return i;
-}
-
-/*
- * Helper function to read at most 'size' bytes of trivial newline-mapped
- * transfer-decoded data into 'buf'.  Returns the number of decoded
- * bytes, or 0 for end-of-data.
- */
-static int charset_readmapnl(struct input_state *state, char *buf, int size)
-{
-    int retval = 0;
-    char c;
-
-    while (size && state->rawlen > 0) {
-	c = *state->rawbase;
-	if (c == '\n') {
-	    if (size < 2) {
-		return retval;
-	    }
-	    *buf++ = '\r';
-	    retval++;
-	    size--;
-	    state->rawlen--;
-	}
-	*buf++ = c;
-	state->rawbase++;
-	state->rawlen--;
-	retval++;
-	size--;
-    }
-    return retval;
-}
-
-/*
- * Helper function to read at most 'size' bytes of quoted-printable
- * transfer-decoded data into 'buf'.  Returns the number of decoded
- * bytes, or 0 for end-of-data.
- */
-static int charset_readqp(struct input_state *state, char *buf, int size)
-{
-    int retval = 0;
-    int c, c1, c2;
-    const char *nextline, *endline;
-
-    nextline = endline = state->rawbase;
-
-    while (size && state->rawlen) {
-	if (state->rawbase >= nextline) {
-	    /* Ignore trailing whitespace at end of line */
-
-	    nextline =
-		(const char*) memchr(state->rawbase+1, '\r', state->rawlen-1);
-	    if (!nextline) nextline = state->rawbase + state->rawlen;
-	    endline = nextline;
-	    while (endline > state->rawbase &&
-		   (endline[-1] == ' ' || endline[-1] == '\t')) {
-		endline--;
-	    }
-	}
-	if (state->rawbase >= endline) {
-	    state->rawbase += nextline - endline;
-	    state->rawlen -= nextline - endline;
-	    continue;
-	}
-
-	c = state->rawbase[0];
-	if (c == '=') {
-	    if (state->rawlen < 3) {
-		return retval;
-	    }
-	    c1 = state->rawbase[1];
-	    c2 = state->rawbase[2];
-	    state->rawbase += 3;
-	    state->rawlen -= 3;
-	    c1 = HEXCHAR(c1);
-	    c2 = HEXCHAR(c2);
-	    /* Following line also takes care of soft line breaks */
-	    if (c1 == XX && c2 == XX) continue;
-	    *buf++ = (char)((c1 << 4) + c2);
-	    retval++;
-	    size--;
-	}
-	else {
-	    state->rawbase++;
-	    state->rawlen--;
-	    *buf++ = (char)c;
-	    retval++;
-	    size--;
-	}
-    }
-    return retval;
-}
-
-/*
- * Helper function to read at most 'size' bytes of quoted-printable
- * transfer-decoded data into 'buf'.  Returns the number of decoded
- * bytes, or 0 for end-of-data.  Removes any US-ASCII whitespace.
- * Since it just throws out \r's anyway, it's simplier than paying
- * attention to them 
- */
-static int charset_readqp_nospc(struct input_state *state, char *buf, int size)
-{
-    int retval = 0;
-    int c, c1, c2;
-    char dec;
-    const char *nextline, *endline;
-
-    nextline = endline = state->rawbase;
-
-    while (size && state->rawlen) {
-	if (state->rawbase >= nextline) {
-	    /* Ignore trailing whitespace at end of line */
-
-	    nextline =
-		(const char*) memchr(state->rawbase+1, '\n', state->rawlen-1);
-	    if (!nextline) nextline = state->rawbase + state->rawlen;
-	    endline = nextline;
-	    while (endline > state->rawbase && (USASCII(endline[-1]) == END)) {
-		endline--;
-	    }
-	}
-	if (state->rawbase >= endline) {
-	    state->rawbase += nextline - endline;
-	    state->rawlen -= nextline - endline;
-	    continue;
-	}
-
-	c = state->rawbase[0];
-	if (c == '=') {
-	    if (state->rawlen < 3) {
-		return retval;
-	    }
-	    c1 = state->rawbase[1];
-	    c2 = state->rawbase[2];
-	    state->rawbase += 3;
-	    state->rawlen -= 3;
-	    c1 = HEXCHAR(c1);
-	    c2 = HEXCHAR(c2);
-	    /* Following line also takes care of soft line breaks */
-	    if (c1 == XX && c2 == XX) continue;
-	    dec = (char)((c1 << 4) + c2);
-	    if (USASCII(dec) != END) {
-		/* non-whitespace, take it */
-		*buf++ = (char)((c1 << 4) + c2);
-		retval++;
-		size--;
-	    }
-	}
-	else {
-	    state->rawbase++;
-	    state->rawlen--;
-	    if (USASCII(c) != END) {
-		/* non-whitespace, grab it */
-		*buf++ = (char)c;
-		retval++;
-		size--;
-	    }
-	}
-    }
-    return retval;
-}
-
-/*
- * Helper function to read at most 'size' bytes of QP newline-mapped
- * transfer-decoded data into 'buf'.  Returns the number of decoded
- * bytes, or 0 for end-of-data.
- */
-static int charset_readqpmapnl(struct input_state *state, char *buf, int size)
-{
-    int retval = 0;
-    int c, c1, c2;
-    const char *nextline, *endline;
-
-    nextline = endline = state->rawbase;
-
-    while (size && state->rawlen > 0) {
-	if (state->rawbase >= nextline) {
-	    /* Ignore trailing whitespace at end of line */
-
-	    nextline = (const char*)
-		memchr(state->rawbase+1, '\n', state->rawlen - 1);
-	    if (!nextline) nextline = state->rawbase + state->rawlen;
-	    endline = nextline;
-	    while (endline > state->rawbase &&
-		   (endline[-1] == ' ' || endline[-1] == '\t')) {
-		endline--;
-	    }
-	}
-	if (state->rawbase >= endline) {
-	    state->rawbase += nextline - endline;
-	    state->rawlen -= nextline - endline;
-	    continue;
-	}
-
-	c = state->rawbase[0];
-	if (c == '=') {
-	    if (state->rawbase+1 == endline) {
-		state->rawbase = nextline + 1;
-		state->rawlen -= 3 + (nextline - endline);
-
-		continue;
-	    }
-	    if (state->rawlen < 3) {
-		return retval;
-	    }
-	    c1 = state->rawbase[1];
-	    c2 = state->rawbase[2];
-	    state->rawbase += 3;
-	    state->rawlen -= 3;
-	    if (c2 == '\n') state->rawlen--;
-	    c1 = HEXCHAR(c1);
-	    c2 = HEXCHAR(c2);
-	    if (c1 == XX && c2 == XX) continue;
-	    *buf++ = (char)((c1 << 4) + c2);
-	    retval++;
-	    size--;
-	}
-	else if (c == '\n') {
-	    if (size < 2) {
-		return retval;
-	    }
-	    state->rawbase++;
-	    state->rawlen -= 2;
-	    *buf++ = '\r';
-	    *buf++ = '\n';
-	    retval += 2;
-	    size -= 2;
-	}
-	else {
-	    state->rawbase++;
-	    state->rawlen--;
-	    *buf++ = (char)c;
-	    retval++;
-	    size--;
-	}
+    convert_catn(input, msg_base, len);
+
+    /* extract the string from the buffer, messy - but we want to
+     * do it without becoming a cstring or being prematurely freed! */
+    {
+	struct buffer_state *buf = (struct buffer_state *)tobuffer->state;
+	*retval = buf->base;
+	*outlen = buf->offset;
+	buf->base = 0;
+	buf->alloced = 0;
+	buf->offset = 0;
     }
-    return retval;
-}
 
-/*
- * Helper function to read at most 'size' bytes of base64
- * transfer-decoded data into 'buf'.  Returns the number of decoded
- * bytes, or 0 for end-of-data.
- */
-static int charset_readbase64(struct input_state *state, char *buf, int size)
-{
-    int retval = 0;
-    int c1, c2, c3, c4;
-
-    while (size >= 3 && state->rawlen) {
-	do {
-	    c1 = *state->rawbase++;
-	    state->rawlen--;
-	    if (c1 == '=') {
-		state->rawlen = 0;
-		return retval;
-	    }
-	} while (state->rawlen && CHAR64(c1) == XX);
-	if (!state->rawlen) {
-	    return retval;
-	}
-
-	do {
-	    c2 = *state->rawbase++;
-	    state->rawlen--;
-	    if (c2 == '=') {
-		state->rawlen = 0;
-		return retval;
-	    }
-	} while (state->rawlen && CHAR64(c2) == XX);
-	if (!state->rawlen) {
-	    return retval;
-	}
+    convert_free(input);
 
-	do {
-	    c3 = *state->rawbase++;
-	    state->rawlen--;
-	    if (c3 == '=') {
-		*buf++ = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4));
-		retval++;
-		state->rawlen = 0;
-		return retval;
-	    }
-	} while (state->rawlen && CHAR64(c3) == XX);
-	if (!state->rawlen) {
-	    return retval;
-	}
-
-	do {
-	    c4 = *state->rawbase++;
-	    state->rawlen--;
-	    if (c4 == '=') {
-		*buf++ = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4));
-		*buf++ = (char)(((CHAR64(c2)&0xf)<<4) | ((CHAR64(c3)&0x3c)>>2));
-		retval += 2;
-		state->rawlen = 0;
-		return retval;
-	    }
-	} while (state->rawlen && CHAR64(c4) == XX);
-	if (CHAR64(c4) == XX) {
-	    return retval;
-	}
-
-	*buf++ = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4));
-	*buf++ = (char)(((CHAR64(c2)&0xf)<<4) | ((CHAR64(c3)&0x3c)>>2));
-	*buf++ = (char)(((CHAR64(c3)&0x3)<<6) | CHAR64(c4));
-	retval += 3;
-	size -= 3;
-    }
-    return retval;
-}
-
-/*
- * Helper function to read at most 'size' bytes of base64
- * transfer-decoded data into 'buf'.  Returns the number of decoded
- * bytes, or 0 for end-of-data.  Removes any US-ASCII whitespace.
- */
-static int charset_readbase64_nospc(struct input_state *state, 
-				    char *buf, int size)
-{
-    int retval = 0;
-    int c1, c2, c3, c4;
-    char dec;
-
-    while (size >= 3 && state->rawlen) {
-	do {
-	    c1 = *state->rawbase++;
-	    state->rawlen--;
-	    if (c1 == '=') {
-		state->rawlen = 0;
-		return retval;
-	    }
-	} while (state->rawlen && CHAR64(c1) == XX);
-	if (!state->rawlen) {
-	    return retval;
-	}
-
-	do {
-	    c2 = *state->rawbase++;
-	    state->rawlen--;
-	    if (c2 == '=') {
-		state->rawlen = 0;
-		return retval;
-	    }
-	} while (state->rawlen && CHAR64(c2) == XX);
-	if (!state->rawlen) {
-	    return retval;
-	}
-
-	do {
-	    c3 = *state->rawbase++;
-	    state->rawlen--;
-	    if (c3 == '=') {
-		dec = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4));
-		if (USASCII(dec) != END) {
-		    *buf++ = dec;
-		    retval++;
-		}
-		state->rawlen = 0;
-		return retval;
-	    }
-	} while (state->rawlen && CHAR64(c3) == XX);
-	if (!state->rawlen) {
-	    return retval;
-	}
-
-	do {
-	    c4 = *state->rawbase++;
-	    state->rawlen--;
-	    if (c4 == '=') {
-		dec = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4));
-		if (USASCII(dec) != END) {
-		    *buf++ = dec;
-		    retval++;
-		}
-		dec = (char)(((CHAR64(c2)&0xf)<<4) | ((CHAR64(c3)&0x3c)>>2));
-		if (USASCII(dec) != END) {
-		    *buf++ = dec;
-		    retval++;
-		}
-		state->rawlen = 0;
-		return retval;
-	    }
-	} while (state->rawlen && CHAR64(c4) == XX);
-	if (CHAR64(c4) == XX) {
-	    return retval;
-	}
-
-	dec  = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4));
-	if (USASCII(dec) != END) {
-	    *buf++ = dec;
-	    retval++;
-	    size--;
-	}
-	dec = (char)(((CHAR64(c2)&0xf)<<4) | ((CHAR64(c3)&0x3c)>>2));
-	if (USASCII(dec) != END) {
-	    *buf++ = dec;
-	    retval++;
-	    size--;
-	}
-	dec = (char)(((CHAR64(c3)&0x3)<<6) | CHAR64(c4));
-	if (USASCII(dec) != END) {
-	    *buf++ = dec;
-	    retval++;
-	    size--;
-	}
-    }
-    return retval;
+    return *retval;
 }
 
 /*
@@ -1345,8 +1230,8 @@ static int charset_readbase64_nospc(struct input_state *state,
 static char base_64[] =
     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 
-char *charset_encode_mimebody(const char *msg_base, int len,
-			      char *retval, int *outlen, int *outlines)
+char *charset_encode_mimebody(const char *msg_base, size_t len,
+    char *retval, size_t *outlen, int *outlines)
 {
     const unsigned char *s;
     unsigned char s0, s1, s2;
@@ -1403,3 +1288,4 @@ char *charset_encode_mimebody(const char *msg_base, int len,
 
     return (b64_len ? retval : NULL);
 }
+
diff --git a/lib/charset.h b/lib/charset.h
index 573975a..4e495e8 100644
--- a/lib/charset.h
+++ b/lib/charset.h
@@ -44,10 +44,6 @@
 #ifndef INCLUDED_CHARSET_H
 #define INCLUDED_CHARSET_H
 
-/* Marker to indicate characters that don't map to anything */
-#define EMPTY 'X'
-#define EMPTY_STRING "X"
-
 #define ENCODING_NONE 0
 #define ENCODING_QP 1
 #define ENCODING_BASE64 2
@@ -60,22 +56,27 @@ typedef int charset_index;
 
 /* ensure up to MAXTRANSLATION times expansion into buf */
 extern char *charset_convert(const char *s, charset_index charset, char *buf,
-    int bufsz);
-extern char *charset_decode_mimeheader(const char *s, char *buf, int bufsz);
+    size_t bufsz);
+extern char *charset_decode_mimeheader(const char *s, char *buf, 
+    size_t bufsz);
+extern char *charset_parse_mimeheader(const char *s);
 
 extern charset_index charset_lookupname(const char *name);
 extern comp_pat *charset_compilepat(const char *s);
 extern void charset_freepat(comp_pat *pat);
 extern int charset_searchstring(const char *substr, comp_pat *pat,
-    const char *s, int len);
+    const char *s, size_t len);
 extern int charset_searchfile(const char *substr, comp_pat *pat,
-                              const char *msg_base, int mapnl, int len, 
+                              const char *msg_base, int mapnl, size_t len, 
                               charset_index charset, int encoding);
-extern char *charset_decode_mimebody(const char *msg_base, int len,
-				     int encoding, char **retval, int alloced,
-				     int *outlen);
-extern char *charset_encode_mimebody(const char *msg_base, int len,
-				     char *retval, int *outlen, int *outlines);
+extern char *charset_decode_mimebody(const char *msg_base, size_t len,
+				     int encoding, char **retval, size_t alloced,
+				     size_t *outlen);
+extern char *charset_encode_mimebody(const char *msg_base, size_t len,
+				     char *retval, size_t *outlen, 
+				     int *outlines);
+extern char *charset_to_utf8(const char *msg_base, size_t len, charset_index charset, int encoding);
+extern int charset_search_mimeheader(const char *substr, comp_pat *pat, const char *s, int searchform);
 
 /* Definitions for charset_extractfile */
 
@@ -121,7 +122,7 @@ typedef void index_search_text_receiver_t(int UID, int part, int cmds,
    by index_getsearchtextmsg to extract the MIME body parts. */ 
 extern int charset_extractfile(index_search_text_receiver_t receiver,
                                void* rock, int uid, const char *msg_base, 
-                               int mapnl, int len, charset_index charset,
+                               int mapnl, size_t len, charset_index charset,
                                int encoding);
 
 #endif /* INCLUDED_CHARSET_H */
diff --git a/lib/charset/iso-2022-jp.t b/lib/charset/iso-2022-jp.t
index 8d1a161..8c12572 100644
--- a/lib/charset/iso-2022-jp.t
+++ b/lib/charset/iso-2022-jp.t
@@ -67,7 +67,7 @@
 18 0018 CANCEL (CAN)
 19 0019 END OF MEDIUM (EM)
 1A 001a SUBSTITUTE (SUB)
-1B >ESC
+# 1B >ESC
 1C 001c FILE SEPARATOR (IS4)
 1D 001d GROUP SEPARATOR (IS3)
 1E 001e RECORD SEPARATOR (IS2)
@@ -169,17 +169,11 @@
 7E 007e TILDE
 7F 007f DELETE (DEL)
 
-:ESC <
-24 :ESC-$
-28 :ESC-(
-
-:ESC-( <
-42 :US-ASCII
-4A :JIS-0201
-
-:ESC-$ <
-40 :JIS-0208
-42 :JIS-0208
+# ESCAPE CODES
+1B2440 :JIS-0208 ESC-$-@
+1B2442 :JIS-0208 ESC-$-B
+1B2842 :US-ASCII ESC-(-B
+1B284A :JIS-0201 ESC-(-J
 
 :JIS-0201
  0 0000 NULL (NUL)
@@ -209,7 +203,7 @@
 18 0018 CANCEL (CAN)
 19 0019 END OF MEDIUM (EM)
 1A 001a SUBSTITUTE (SUB)
-1B >ESC
+# 1B >ESC
 1C 001c FILE SEPARATOR (IS4)
 1D 001d GROUP SEPARATOR (IS3)
 1E 001e RECORD SEPARATOR (IS2)
@@ -375,8 +369,13 @@ DD FF9D HALFWIDTH KATAKANA LETTER N
 DE FF9E HALFWIDTH KATAKANA VOICED SOUND MARK
 DF FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
 
+# ESCAPE CODES
+1B2440 :JIS-0208 ESC-$-@
+1B2442 :JIS-0208 ESC-$-B
+1B2842 :US-ASCII ESC-(-B
+1B284A :JIS-0201 ESC-(-J
+
 :JIS-0208
-1B >ESC
 2121 3000 IDEOGRAPHIC SPACE
 2122 3001 IDEOGRAPHIC COMMA
 2123 3002 IDEOGRAPHIC FULL STOP
@@ -7256,3 +7255,9 @@ DF FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
 7424 7464 <CJK>
 7425 51DC <CJK>
 7426 7199 <CJK>
+
+# ESCAPE CODES
+1B2440 :JIS-0208 ESC-$-@
+1B2442 :JIS-0208 ESC-$-B
+1B2842 :US-ASCII ESC-(-B
+1B284A :JIS-0201 ESC-(-J
diff --git a/lib/charset/iso-2022-kr.t b/lib/charset/iso-2022-kr.t
index 1ded497..ba14972 100644
--- a/lib/charset/iso-2022-kr.t
+++ b/lib/charset/iso-2022-kr.t
@@ -67,7 +67,7 @@
 18 0018 CANCEL (CAN)
 19 0019 END OF MEDIUM (EM)
 1A 001a SUBSTITUTE (SUB)
-1B >ESC
+# 1B >ESC
 1C 001c FILE SEPARATOR (IS4)
 1D 001d GROUP SEPARATOR (IS3)
 1E 001e RECORD SEPARATOR (IS2)
@@ -169,14 +169,8 @@
 7E 007e TILDE
 7F 007f DELETE (DEL)
 
-:ESC <
-24 :ESC-$
-
-:ESC-$ <
-29 :ESC-$-)
-
-:ESC-$-) <
-43 :US-ASCII
+# ESCAPE CODE (pretty pointless, but we need to return nothing when the input follows this path)
+1B242943 :US-ASCII ESC-$-)-C
 
 :KSC-5601
 0f :US-ASCII
diff --git a/lib/chartable.h b/lib/chartable.h
index 0b9c3fc..63dd2d0 100644
--- a/lib/chartable.h
+++ b/lib/chartable.h
@@ -42,31 +42,14 @@
  * $Id: chartable.h,v 1.6 2008/03/24 17:43:08 murch Exp $
  */
 
-/* note that these are all uppercase letters. since the translation
-   tables canonicalize to lower case letters, we never see these bytes
-   in the output UTF-8 and they're safely used as control codes to the
-   character decoder. */
-
-/* note that currently we never return a character that is represented
- * by more than 3 octets in UTF-8, since we only deal with characters
- * in UCS-2. this means that 11110xxx, 111110xx, and 1111110x never
- * appear in our outgoing tables, and could be used instead of the following.
- */
-
-#define XLT 'N'			/* Long translation */
-#define U7F 'O'			/* UTF-7 first base64 character */
-#define U7N 'P'			/* UTF-7 subsquent base64 character */
-#define U83 'Q'			/* UTF-8 3-char sequence */
-#define U83_2 'R'		/* second char of same */
-#define U83_3 'S'		/* third char of same */
-#define JSR 'T'
-#define JMP 'U'
-#define RET 'V'
-#define END 'W'
+struct charmap {        /* element type of the 256-entry rows that struct charset's table points at */
+    unsigned int c;     /* NOTE(review): presumably the Unicode code point for this input byte -- confirm in charset.c */
+    unsigned char next; /* NOTE(review): presumably the index of the table used for the following byte -- confirm in charset.c */
+};
 
 struct charset {
     char *name;
-    const unsigned char (*table)[256][4];
+    const struct charmap (*table)[256]; /* pointer to array(s) of 256 struct charmap entries */
 };
 
 
diff --git a/lib/mkchartable.c b/lib/mkchartable.c
deleted file mode 100644
index 1980258..0000000
--- a/lib/mkchartable.c
+++ /dev/null
@@ -1,975 +0,0 @@
-/* mkchartable.c -- Generate character set mapping table
- *
- * Copyright (c) 1994-2008 Carnegie Mellon University.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. The name "Carnegie Mellon University" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For permission or any legal
- *    details, please contact
- *      Carnegie Mellon University
- *      Center for Technology Transfer and Enterprise Creation
- *      4615 Forbes Avenue
- *      Suite 302
- *      Pittsburgh, PA  15213
- *      (412) 268-7393, fax: (412) 268-7395
- *      innovation@andrew.cmu.edu
- *
- * 4. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by Computing Services
- *     at Carnegie Mellon University (http://www.cmu.edu/computing/)."
- *
- * CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
- * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
- * AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
- * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * $Id: mkchartable.c,v 1.28 2009/03/31 04:11:22 brong Exp $
- */
-
-#include <config.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include "xmalloc.h"
-#include "util.h"
-
-#define XX 127
-/*
- * Table for decoding hexadecimal
- */
-static const char index_hex[256] = {
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-     0, 1, 2, 3,  4, 5, 6, 7,  8, 9,XX,XX, XX,XX,XX,XX,
-    XX,10,11,12, 13,14,15,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,10,11,12, 13,14,15,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-    XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
-};
-#define HEXCHAR(c)  (index_hex[(unsigned char)(c)])
-
-#define MAX_MAPCODE 20
-
-struct cmap {
-    int code;
-    int num_mapcode;
-    int mapcode[MAX_MAPCODE];
-    char *translation;
-    int trans_offset;
-};
-
-struct cmap *map=NULL;
-int map_num=0;
-int map_alloc=0;
-#define MAPGROW 200
-
-struct tablechar {
-    int code;
-    char *translation;
-    int trans_offset;
-    char *action;
-    char *comment;
-};
-#define EMPTYTCHAR(tc) ((tc).code == -1 && !(tc).translation && !(tc).action)
-
-struct table {
-    char *name;
-    char *endaction;
-    struct tablechar ch[256];
-};
-
-struct table *table=NULL;
-int table_num=0;
-int table_alloc=0;
-#define TABLEGROW 200
-
-static void readmapfile(char *name);
-static void mungemappings(void);
-static void readcharfile(char *name);
-static void printtable(char *name);
-static void freetabledata(void);
-static void freetable(void);
-static void freemap(void);
-static void usage(void);
-static int newstate(char *args);
-static int findstate(char *name);
-static void mkunicodetable(void);
-static void mkutf8table(void);
-static void mkutf7table(void);
-
-int
-main(int argc, char **argv)
-{
-    int opt;
-
-    while ((opt = getopt(argc, argv, "m:")) != EOF) {
-	switch (opt) {
-	case 'm':
-	    readmapfile(optarg);
-	    break;
-
-	default:
-	    usage();
-	}
-    }
-
-    if (map_num == 0 || argc == optind) usage();
-
-    printf("#include \"charset.h\"\n");
-    printf("#include \"chartable.h\"\n");
-
-    mungemappings();
-
-    fprintf(stderr, "mkchartable: mapping unicode...\n");
-    mkunicodetable();
-    printtable("unicode");
-
-    fprintf(stderr, "mkchartable: mapping UTF-8...\n");
-    mkutf8table();
-    printtable("utf-8");
-
-    fprintf(stderr, "mkchartable: mapping UTF-7...\n");
-    mkutf7table();
-    printtable("utf-7");
-
-    while (argv[optind]) {
-	fprintf(stderr, "mkchartable: mapping %s...\n", argv[optind]);
-	readcharfile(argv[optind]);
-	printtable(argv[optind]);
-	freetabledata();
-	optind++;
-    }
-
-    printf("/*\n");
-    printf(" * Mapping of character sets to tables\n");
-    printf(" */\n");
-    printf("const struct charset chartables_charset_table[] = {\n");
-    printf("    { \"us-ascii\", chartables_us_ascii },	/* US-ASCII must be charset number 0 */\n");
-    printf("    { \"utf-8\", chartables_utf_8 },\n");
-    printf("    { \"utf-7\", chartables_utf_7 },\n");
-    printf("    { \"iso-8859-1\", chartables_iso_8859_1 },\n");
-    printf("    { \"iso-8859-2\", chartables_iso_8859_2 },\n");
-    printf("    { \"iso-8859-3\", chartables_iso_8859_3 },\n");
-    printf("    { \"iso-8859-4\", chartables_iso_8859_4 },\n");
-    printf("    { \"iso-8859-5\", chartables_iso_8859_5 },\n");
-    printf("    { \"iso-8859-6\", chartables_iso_8859_6 },\n");
-    printf("    { \"iso-8859-7\", chartables_iso_8859_7 },\n");
-    printf("    { \"iso-8859-8\", chartables_iso_8859_8 },\n");
-    printf("    { \"iso-8859-9\", chartables_iso_8859_9 },\n");
-    printf("    { \"koi8-r\", chartables_koi8_r },\n");
-    printf("    { \"iso-2022-jp\", chartables_iso_2022_jp },\n");
-    printf("    { \"iso-2022-kr\", chartables_iso_2022_kr },\n");
-    printf("    { \"gb2312\", chartables_gb2312 },\n");
-    printf("    { \"big5\", chartables_big5 },\n");
-    printf("    /* Compatibility names */\n");
-    printf("    { \"unicode-1-1-utf-7\", chartables_utf_7 },\n");
-    printf("    { \"unicode-2-0-utf-7\", chartables_utf_7 },\n");
-    printf("    { \"x-unicode-2-0-utf-7\", chartables_utf_7 },\n");
-    printf("    /* End Compatibility Names */\n");
-    printf("    { \"iso-8859-15\", chartables_iso_8859_15 },\n");
-    printf("    { \"windows-1252\", chartables_windows_1252 },\n");
-    printf("    { \"windows-1256\", chartables_windows_1256 },\n");
-    printf("    { \"windows-1250\", chartables_windows_1250 },\n");
-    printf("    { \"windows-1251\", chartables_windows_1251 },\n");
-    printf("    { \"windows-1255\", chartables_windows_1255 },\n");
-    printf("    /* New character sets should only be added to end so that\n");
-    printf("     * cache files stay with valid information */\n");
-    printf("};\n");
-    printf("const int chartables_num_charsets = (sizeof(chartables_charset_table)/sizeof(*chartables_charset_table));\n");
-
-    freetable();
-    freemap();
-
-    return 0;
-}
-
-static void usage(void)
-{
-    fprintf(stderr, "usage: mkchartable -m mapfile charsetfile...\r\n");
-    exit(1);
-}
-
-/* Read a Unicode table, deriving useful mappings from it */
-static void
-readmapfile(char *name)
-{
-    FILE *mapfile;
-    char buf[1024];
-    char *p;
-    int line = 0;
-    int n, code, i, c;
-    static struct cmap zeromap;
-
-    mapfile = fopen(name, "r");
-    if (!mapfile) {
-	perror(name);
-	exit(1);
-    }
-
-    while (fgets(buf, sizeof(buf), mapfile)) {
-	line++;
-	p = buf;
-	while (*p && Uisspace(*p)) p++;
-	if (!*p || *p == '#') continue;
-
-	/* Unicode character */
-	code = 0;
-	for (i=0; i<4; i++) {
-	    c = HEXCHAR(*p);
-	    p++;
-	    if (c == XX) goto syntaxerr;
-	    code = code*16 + c;
-	}
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Character name */
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-	   
-	if (map_num == map_alloc) {
-	    map_alloc += MAPGROW;
-	    map = (struct cmap *)
-		xrealloc((char *)map, map_alloc * sizeof(struct cmap));
-	}
-	map[map_num] = zeromap;
-	map[map_num].code = code;
-	
-	/* General Category */
-	if (*p == 'Z') {
-	    /* Is whitespace, map to empty string */
-	    map[map_num].num_mapcode = 0;
-	    map_num++;
-	    continue;
-	}
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Canonical Combining Class */
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Bidirectional category */
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Character decomposition */
-	n = 0;
-	while (*p && *p != ';') {
-	    if (n + 1 == MAX_MAPCODE) goto syntaxerr;
-	    if (*p == '<') {
-		/* Compatability mapping, skip over the <type> */
-		p = strchr(p, '>');
-		if (!p || p[1] != ' ') goto syntaxerr;
-		p += 2;
-
-		/* Ignore compat mappings to SP followed by combining char */
-		if (!strncmp(p, "0020 ", 5)) {
-		    p = strchr(p, ';');
-		    break;
-		}
-	    }
-
-	    code = 0;
-	    for (i=0; i<4; i++) {
-		c = HEXCHAR(*p);
-		p++;
-		if (c == XX) goto syntaxerr;
-		code = code*16 + c;
-	    }
-	    if (*p == ' ') p++;
-	    map[map_num].mapcode[n++] = code;
-	}
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Decimal digit value */
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-			   
-	/* Digit value */
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Numeric value */
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Mirrored character */
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Unicode 1.0 name */
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Comment */
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Upper case equivalent mapping */
-	while (*p && *p != ';') p++;
-	if (*p++ != ';') goto syntaxerr;
-
-	/* Lower case equivalent mapping */
-	if (*p == ';') {
-	    /* No case mapping, use any decomposition we found above */
-	    if (n) {
-		map[map_num].num_mapcode = n;
-		map_num++;
-	    }
-	    continue;
-	}
-	code = 0;
-	for (i=0; i<4; i++) {
-	    c = HEXCHAR(*p);
-	    p++;
-	    if (c == XX) goto syntaxerr;
-	    code = code*16 + c;
-	}
-	if (*p != ';') goto syntaxerr;
-	map[map_num].mapcode[0] = code;
-	map[map_num].num_mapcode = 1;
-	map_num++;
-    }
-    fclose(mapfile);
-    return;
- syntaxerr:
-    fprintf(stderr, "%s: line %d: syntax error\n", name, line);
-    exit(1);
-}
-
-/* Perform the transitive closure on the unicode mapping table
- * Calculate translations for mappings
- */
-static void
-mungemappings(void)
-{
-    int didchange;
-    int n, newn, n_mapcode, i;
-    int new_mapcode[MAX_MAPCODE];
-    int num_new_mapcode;
-    int last_translation = 1;
-    int max_len = 3;
-    
-    /* Keep scanning the table until no changes are made */
-    do {
-	didchange = 0;
-
-	fprintf(stderr, "mkchartable: expanding unicode mappings...\n");
-
-	for (n = 0; n < map_num; n++) {
-	    /* Build new map code sequence by iterating over existing
-	     * mapcode sequence
-	     */
-	    num_new_mapcode = 0;
-	    for (n_mapcode = 0; n_mapcode < map[n].num_mapcode; n_mapcode++) {
-
-		/* Search for a translation of this particular code */
-		for (newn = 0; newn < map_num; newn++) {
-		    if (map[newn].code == map[n].mapcode[n_mapcode]) break;
-		}
-		if (newn != map_num) {
-		    /* We have a translation */
-		    didchange++;
-		    for (i = 0; i < map[newn].num_mapcode; i++) {
-			new_mapcode[num_new_mapcode++] = map[newn].mapcode[i];
-		    }
-		}
-		else {
-		    /* Keep the old mapping for this code */
-		    new_mapcode[num_new_mapcode++] = map[n].mapcode[n_mapcode];
-		}
-	    }
-
-	    /* Copy in the new translation */
-	    map[n].num_mapcode = num_new_mapcode;
-	    memcpy(map[n].mapcode, new_mapcode, sizeof(new_mapcode));
-	}
-    } while (didchange);
-
-    printf("/* The following unicode mapping table is in effect\n");
-    printf("From To\n");
-    for (n = 0; n < map_num; n++) {
-	printf("\n%04x", map[n].code);
-	for (i = 0; i < map[n].num_mapcode; i++) {
-	    printf(" %04x", map[n].mapcode[i]);
-	}
-    }
-    printf("\n*/\n");
-
-    fprintf(stderr, "mkchartable: building expansion table...\n");    
-
-    printf("/* Table of traslations longer than three octets.\n");
-    printf(" * The XLT code in other tables is followed by an 2-octet\n");
-    printf(" * index into this table.\n");
-    printf(" * The index of 0 is reserved to mean 'no translation'\n");
-    printf(" */\n");
-    printf("const unsigned char chartables_long_translations[] = { 0, \n");
-
-    for (n = 0; n < map_num; n++) {
-	int n_mapcode, code;
-	unsigned char translation[256];
-	int n_t;
-	
-	/* Build translation strings for mappings to 0 or multiple codes */
-	if (map[n].num_mapcode == 0) {
-	    map[n].translation = xstrdup("");
-	}
-	else if (map[n].num_mapcode > 1) {
-	    n_t = 0;
-	    for (n_mapcode = 0; n_mapcode < map[n].num_mapcode; n_mapcode++) {
-		code = map[n].mapcode[n_mapcode];
-		/* Convert code to UTF-8 */
-		if (code && code <= 0x7f) {
-		    translation[n_t++] = (unsigned char)code;
-		}
-		else if (code <= 0x7FF) {
-		    translation[n_t++] = (unsigned char) (0xc0 + (code>>6));
-		    translation[n_t++] = (unsigned char) (0x80+(code&0x3f));
-		}
-		else {
-		    translation[n_t++] = (unsigned char) (0xe0 + (code>>12));
-		    translation[n_t++] = (unsigned char) (0x80+((code>>6)&0x3f));
-		    translation[n_t++] = (unsigned char) (0x80+(code&0x3f));
-		}
-	    }
-	    if (n_t <= 3) {
-		map[n].translation = xmalloc(4);
-		memcpy(map[n].translation, translation, n_t);
-		map[n].translation[n_t] = '\0';
-	    }
-	    else {
-		if (n_t > max_len) max_len = n_t;
-		for (i = 0; i < n_t; i++) {
-		    code = translation[i];
-		    if (isprint(code) && code != '\\' && code != '\"' && code != '\'') {
-			printf(" '%c',", code);
-		    } else {
-			printf(" %3d,", code);
-		    }
-		}
-		printf(" END, /* Translation for %04x (offset %04x) */\n",
-		       map[n].code, last_translation);
-		map[n].trans_offset = last_translation;
-
-                /* last_translation points to the offset the next translation will start from */
-		last_translation += n_t + 1;
-	    }
-	}
-    }
-    printf("};\n\n const int charset_max_translation = %d;\n\n", max_len);
-}
-
-static void
-setcode(int state, int character, int code)
-{
-    int i = 0;
-
-    for (i = 0; i < map_num; i++) {
-	if (map[i].code == code) break;
-    }
-
-    if (i == map_num) {
-	table[state].ch[character].code = code;
-    } else if (map[i].translation) {
-	table[state].ch[character].translation = map[i].translation;
-    } else if (map[i].trans_offset) {
-	table[state].ch[character].trans_offset = map[i].trans_offset;
-    } else {
-	table[state].ch[character].code = map[i].mapcode[0];
-    }
-	
-}
-
-static void
-readcharfile(char *name)
-{
-    FILE *charfile;
-    char buf[1024];
-    char *p;
-    int line = 0;
-    int curstate = -1;
-    int thischar, thisstate;
-    int code, i, c;
-    
-    charfile = fopen(name, "r");
-    if (!charfile) {
-	perror(name);
-	exit(1);
-    }
-
-    table_num = 0;
-
-    while (fgets(buf, sizeof(buf), charfile)) {
-	line++;
-	p = buf + strlen(buf);
-	if (p > buf && p[-1] == '\n') p[-1] = '\0';
-	p = buf;
-	while (*p && Uisspace(*p)) p++;
-	if (!*p || *p == '#') continue;
-
-	if (*p == ':') {
-	    /* New state */
-	    curstate = newstate(p+1);
-	    continue;
-	}
-	
-	if (curstate == -1) {
-	    curstate = newstate("");
-	}
-
-	thisstate = curstate;
-	thischar = i = 0;
-	while (!Uisspace(*p)) {
-	    c = HEXCHAR(*p);
-	    i++;
-	    p++;
-	    if (c == XX) goto syntaxerr;
-	    thischar = thischar*16 + c;
-	}
-	while (*p && Uisspace(*p)) p++;
-
-	if (i > 4) goto syntaxerr;	
-	if (i > 2) {
-	    if (EMPTYTCHAR(table[thisstate].ch[thischar>>8])) {
-                /* we create a new state (not in the input file) to
-                   deal with multibyte characters that start with the
-                   byte 'thischar >> 8'. */
-
-		char action[1024];
-
-		sprintf(action, ">%s_%02x <", table[thisstate].name,
-			thischar>>8);
-		table[thisstate].ch[thischar>>8].action = xstrdup(action);
-		*(strchr(table[thisstate].ch[thischar>>8].action, ' ')) = '\0';
-		table[thisstate].ch[thischar>>8].comment = xstrdup("multi-byte");
-		thisstate = newstate(action+1);
-	    }
-	    else if (!table[thisstate].ch[thischar>>8].action ||
-		     table[thisstate].ch[thischar>>8].action[0] != '>') {
-                /* either we think this byte isn't the start of a
-                   multibyte character, or the action associated with this
-                   byte isn't a state change. */
-
-		fprintf(stderr,
-			"%s: line %d: multibyte/single-byte conflict\n",
-			name, line);
-		exit(1);
-	    }
-	    else {
-                /* we find the already created state to deal with multibytes
-                   starting with 'thischar >> 8' and move to it so we
-                   insert the 2nd byte of this multibyte char in the right
-                   state. */
-
-		thisstate =
-		  findstate(table[thisstate].ch[thischar>>8].action+1);
-		if (thisstate == -1) {
-		    fprintf(stderr,
-			    "%s: line %d: can't find multibyte state\n",
-			    name, line);
-		    exit(1);
-		}
-	    }
-	    thischar &= 0xff;
-	}
-
-	if (!EMPTYTCHAR(table[thisstate].ch[thischar])) {
-	    fprintf(stderr, "%s: line %d: duplicate defs for %x\n",
-		    name, line, thischar);
-	    exit(1);
-	}
-
-	table[thisstate].ch[thischar].comment = xstrdup(buf);
-
-	if (*p == '?') {
-	    continue;
-	}
-
-	if (*p == ':' || *p == '>' || *p == '<') {
-	    p = table[thisstate].ch[thischar].action = xstrdup(p);
-	    while (*p && !Uisspace(*p)) p++;
-	    *p = '\0';
-	    continue;
-	}
-
-	code = 0;
-	for (i=0; i<4; i++) {
-	    c = HEXCHAR(*p);
-	    p++;
-	    if (c == XX) goto syntaxerr;
-	    code = code*16 + c;
-	}
-	setcode(thisstate, thischar, code);
-    }
-    fclose(charfile);
-    return;
- syntaxerr:
-    fprintf(stderr, "%s: line %d: syntax error\n", name, line);
-    exit(1);
-}
-
-/* Generate the table used for mapping raw unicode values */
-static void mkunicodetable(void)
-{
-    int i;
-    int thisstate;
-    unsigned char need_block[256];
-    int block;
-    char buf[80];
-
-    /* Record which blocks we need mappings for */
-    for (i = 0; i < 256; i++) {
-	need_block[i] = 0;
-    }
-    for (i = 0; i < map_num; i++) {
-	need_block[map[i].code>>8] = 1;
-    }
-
-    table_num = 0;
-
-    printf("/* The next two tables are used for doing translations on\n");
-    printf(" * 16-bit unicode values.  First look up the Unicode block\n");
-    printf(" * (high-order byte) in the chartables_unicode_block table\n");
-    printf(" * to find the index into chartables_unicode for that block.\n");
-    printf(" * If the index is 255, there are no translations for that\n");
-    printf(" * block, so characters can be encoded in UTF-8 algorithmically\n");
-    printf(" * Otherwise, look up the low-order byte in the chartables_unicode\n");
-    printf(" * using the index to select the state.\n");
-    printf(" */\n");
-    printf("const unsigned char chartables_unicode_block[256] = {");
-
-    for (block = 0; block < 256; block++) {
-	if (!(block & 0x7)) printf("\n");
-	if (!need_block[block]) {
-	    printf(" 255,");
-	    continue;
-	}
-
- 	sprintf(buf, "BLOCK-%02x-INDEX-%d", block, table_num);
-	thisstate = newstate(buf);
-	printf(" %3d,", thisstate);
-
-	for (i = 0; i < 256; i++) {
-	    setcode(thisstate, i, (block << 8) + i);
-	}
-    }
-
-    printf("\n};\n\n");
-
-    printf("/* NOTE: Unlike other charset translation tables, the \n");
-    printf(" * chartables_unicode table is NOT used to directly parse\n");
-    printf(" * a charset.  See the comment on chartables_unicode_block\n");
-    printf(" * for a descripton of how this table is used.\n");
-    printf(" */\n");
-}
-
-static void mkutf8table(void)
-{
-    int start_state, thisstate;
-    int thischar, prefix;
-    char buf[80];
-
-    table_num = 0;
-
-    start_state = newstate("START");
-
-    /* Populate the ascii section */
-    for (thischar = 0; thischar <= 0x7f; thischar++) {
-	setcode(start_state, thischar, thischar);
-    }
-
-    /* 3-char sequence tables must be numbered 1 and 2 */
-    thisstate = newstate("STATE-3-2 <");
-    for (thischar = 0x80; thischar <= 0xbf; thischar++) {
-	table[thisstate].ch[thischar].action = "U83_2";
-    }
-    thisstate = newstate("STATE-3-3 <");
-    for (thischar = 0x80; thischar <= 0xbf; thischar++) {
-	table[thisstate].ch[thischar].action = "U83_3";
-    }
-
-    /* Populate 2-char sequences---the first byte shifts to another
-     * state; the 2nd byte chooses the character, just like any other
-     * 2-byte encoding */
-    for (prefix = 2; prefix <= 0x1f; prefix++) {
-	sprintf(buf, ">STATE-2-%02x", prefix);
-	table[start_state].ch[prefix+0xc0].action = xstrdup(buf);
-	strcat(buf, " <");
-	thisstate = newstate(xstrdup(buf+1));
-	for (thischar = 0; thischar <= 0x3f; thischar++) {
-	    setcode(thisstate, thischar+0x80, thischar+(prefix<<6));
-	}
-    }
-
-    /* Populate 3-char sequences, which the decoder handles
-     * magically, outside of the state system. */
-    for (thischar = 0xe0; thischar <= 0xef; thischar++) {
-	table[start_state].ch[thischar].action = "U83";
-    }
-    
-}
-
-static char basis_64[] =
-   "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-static void mkutf7table(void)
-{
-    int start_state, thisstate;
-    int thischar;
-    char *p;
-
-    table_num = 0;
-
-    start_state = newstate("START");
-
-    /* Populate the ascii section */
-    table[start_state].ch['+'].action = ">GOTSHIFT";
-    for (thischar = 0; thischar <= 0x7f; thischar++) {
-	if (!table[start_state].ch[thischar].action) {
-	    setcode(start_state, thischar, thischar);
-	}
-    }
-
-    /* Normal base64 decoding table must be numbered 1 */
-    thisstate = newstate("B64NORMAL <");
-    table[thisstate].ch['-'].action = "<";
-    for (p = basis_64; *p; p++) {
-	table[thisstate].ch[*(unsigned char*)p].action = "U7N";
-    }
-    for (thischar = 0; thischar <= 0x7f; thischar++) {
-	if (!table[thisstate].ch[thischar].action) {
-	    setcode(thisstate, thischar, thischar);
-	}
-    }
-    
-    /* Populate initial base64 decoding table */
-    thisstate = newstate("GOTSHIFT <");
-    setcode(thisstate, '-', '+');
-    for (p = basis_64; *p; p++) {
-	table[thisstate].ch[*(unsigned char*)p].action = "U7F";
-    }
-}
-
-static int
-newstate(char *args)
-{
-    char *p;
-    int i;
-
-    if (table_num == table_alloc) {
-	table_alloc += TABLEGROW;
-	table = (struct table *)xrealloc((char *)table,
-					 table_alloc * sizeof(struct table));
-    }
-
-    table[table_num].name = xstrdup(args);
-    table[table_num].endaction = "END";
-    for (i = 0; i < 256; i++) {
-	table[table_num].ch[i].code = -1;
-	table[table_num].ch[i].translation = 0;
-	table[table_num].ch[i].trans_offset = 0;
-	table[table_num].ch[i].action = 0;
-	table[table_num].ch[i].comment = 0;
-    }
-
-    p = table[table_num].name;
-    while (*p && !Uisspace(*p)) p++;
-    if (*p) *p++ = '\0';
-    while (*p) {
-	if (*p == '<') table[table_num].endaction = "RET";
-	p++;
-    }
-
-    return table_num++;
-}
-
-static int
-findstate(char *name)
-{
-    int i;
-
-    for (i = 0; i < table_num; i++) {
-	if (!strcmp(name, table[i].name)) return i;
-    }
-    return -1;
-}
-
-static void
-printtable(char *name)
-{
-    char buf[1024];
-    char *p;
-    int curstate, thischar;
-    int code;
-    char *end;
-    int i;
-    
-    p = strrchr(name, '/');
-    if (!p) p = strrchr(name, '\\');
-    if (p) p++;
-    else p = name;
-    strcpy(buf, p);
-    if ((p = strchr(buf, '.')) != NULL) *p = '\0';
-    while ((p = strchr(buf, '-')) != NULL) *p = '_';
-
-    printf("const unsigned char chartables_%s[%d][256][4] = {\n", buf, table_num);
-
-    for (curstate = 0; curstate < table_num; curstate++) {
-	printf(" {");
-	if (table[curstate].name[0]) {
-	    printf(" /* %s */", table[curstate].name);
-	}
-	printf("\n");
-	
-	for (thischar = 0; thischar < 256; thischar++) {
-	    printf("   {");
-	    if ((code = table[curstate].ch[thischar].code) != -1) {
-		if (code && code <= 0x7f) {
-		    if (isprint(code) && code != '\\' && code != '\"' &&
-			code != '\'') {
-			printf(" '%c', %s,   0,   0,", code,
-			       table[curstate].endaction);
-		    }
-		    else {
-			printf(" %3d, %s,   0,   0,", code,
-			       table[curstate].endaction);
-		    }
-		}
-		else if (code <= 0x7FF) {
-		    printf(" %3d, %3d, %s,   0,", 0xc0 + (code>>6),
-			   0x80+(code&0x3f), table[curstate].endaction);
-		}
-		else {
-		    printf(" %3d, %3d, %3d, %s,", 0xe0 + (code>>12),
-			   0x80+((code>>6)&0x3f), 0x80+(code&0x3f),
-			   table[curstate].endaction);
-		}
-	    } else if ((code = table[curstate].ch[thischar].trans_offset) != 0) {
-		printf(" XLT, %3d, %3d, %s,", code >> 8, code & 0xff,
-		       table[curstate].endaction); 
-	    } else if ((p = table[curstate].ch[thischar].translation) != 0) {
-		end = table[curstate].endaction;
-		for (i = 0; i < 4; i++) {
-		    if (isprint((unsigned char)*p) && *p != '\\' && *p != '\"' && *p != '\'') {
-			printf(" '%c',", *p);
-		    }
-		    else if (!*p) {
-			printf(" %s,", end);
-			end = "  0";
-		    }
-		    else {
-			printf(" %3d,", (unsigned char)*p);
-		    }
-		    if (*p) p++;
-		}
-	    }
-	    else if ((p = table[curstate].ch[thischar].action) == 0) {
-		printf(" EMPTY, %s, 0,   0,", table[curstate].endaction);
-	    }
-	    else if (*p == '<') {
-		printf(" RET,   0,   0,   0,");
-	    }
-	    else if (*p == 'U') {
-		printf(" %s,   0,   0,   0,", p);
-	    }
-	    else {
-		code = findstate(p+1);
-		if (code == -1) {
-		    fprintf(stderr, "%s: unknown state %s\n", name, p+1);
-		}
-		printf(" %s, %3d, %3d,   0,",
-		       *p == '>' ? "JSR" : "JMP",
-		       (code>>8), (code&0xff));
-	    }
-	    printf(" },");
-	    if (table[curstate].ch[thischar].comment) {
-		printf(" /* %s */", table[curstate].ch[thischar].comment);
-	    }
-	    printf("\n");
-	}
-	printf(" },\n");
-    }
-    printf("};\n\n");
-}
-
-static void
-freetabledata(void)
-{
-    int curstate, thischar;
-/*    char *cp; */
-
-    for (curstate = 0; curstate < table_num; curstate++) {
-	for (thischar = 0; thischar < 256; thischar++) {
-	    if (table[curstate].ch[thischar].comment != NULL) {
-		free(table[curstate].ch[thischar].comment);
-	    }
-
-	    if (table[curstate].ch[thischar].action != NULL) {
-		free(table[curstate].ch[thischar].action);
-	    }
-	}
-	if (table[curstate].name != NULL) {
-	    free(table[curstate].name);
-	}
-    }
-}
-
-static void
-freetable(void)
-{
-    if (table_alloc) {
-        free(table);
-	table_alloc=0;
-    }
-}
-
-static void
-freemap(void)
-{
-    int n;
-/*	int n_mapcode; */
-
-    for (n = 0; n < map_num; n++) {
-	if (map[n].translation != NULL) {
-	    free(map[n].translation);
-	}
-    }
-
-    if (map_alloc) {
-        free(map);
-	map_alloc=0;
-    }
-}
-
-void fatal(const char* s, int c)
-{
-    fprintf(stderr, "Error while building charset table: %s\n", s);
-    exit(c);
-}
diff --git a/lib/mkchartable.pl b/lib/mkchartable.pl
new file mode 100644
index 0000000..b9fbbeb
--- /dev/null
+++ b/lib/mkchartable.pl
@@ -0,0 +1,531 @@
+#!/usr/bin/perl
+#
+# mkchartable.pl -- Generate character set mapping table
+#
+# Copyright (c) 1994-2008 Carnegie Mellon University.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+#
+# 3. The name "Carnegie Mellon University" must not be used to
+#    endorse or promote products derived from this software without
+#    prior written permission. For permission or any legal
+#    details, please contact
+#      Carnegie Mellon University
+#      Center for Technology Transfer and Enterprise Creation
+#      4615 Forbes Avenue
+#      Suite 302
+#      Pittsburgh, PA  15213
+#      (412) 268-7393, fax: (412) 268-7395
+#      innovation@andrew.cmu.edu
+#
+# 4. Redistributions of any form whatsoever must retain the following
+#    acknowledgment:
+#    "This product includes software developed by Computing Services
+#     at Carnegie Mellon University (http://www.cmu.edu/computing/)."
+#
+# CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
+# THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+# AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
+# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+# OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+# $Id: mkchartable.pl,v 1.27 2008/03/24 17:43:09 murch Exp $
+
+use strict;
+use warnings;
+
+use IO::File;
+use Getopt::Long;
+use Digest::SHA1;
+
+my @maps;
+my %codemap;
+# GetOptions returns false on a malformed command line; stop with usage then.
+GetOptions( 'map|m=s' => \@maps ) or usage();
+printheader(\@maps, \@ARGV);
+
+# first we parse the chartable unicode mappings and the fixes
+# file to build the unicode to search canonical form tables.
+foreach my $map (@maps) {
+    readmapfile(\%codemap, $map);
+}
+
+# we follow any mappings repeatedly until nothing in the
+# table changes any more
+mungemap(\%codemap);
+
+# then print out the translation tables
+printmap(\%codemap);
+
+# XXX - should probably require all files that are
+# mentioned in the lookup table to be specified,
+# or the generated chartable.c is not going to compile.
+foreach my $opt (@ARGV) {
+    warn "mkchartable: mapping $opt...\n";
+    my $table = readcharfile($opt);
+    printtable($table, $opt);
+}
+
+printlookup();
+
+exit 0;
+
+sub usage {    # complain on STDERR, then stop with a failure status
+    warn "usage: mkchartable -m mapfile charsetfile...\n";
+    exit 1;
+}
+
+# Read a Unicode table, deriving useful mappings from it: for each mapped
+# code point, $codemap->{$code}{chars} lists its replacement code points.
+sub readmapfile {
+    my ($codemap, $name) = @_;
+    my $mapfile = IO::File->new($name, 'r') || die "Failed to open $name\n";
+
+    while (my $line = $mapfile->getline()) {
+	chomp $line;
+	$line =~ s/^\s+//; # strip leading space
+	next if $line =~ m/^\#/; # comment
+	next if $line eq ''; # blank
+
+	# $charname, not $name: the file name must stay visible for messages
+	my ($hexcode, $charname, $category, $combiningclass, $bidicat,
+	    $decomposition, $decimal, $digit, $numeric, $mirroredchar,
+	    $uni1name, $comment, $upper, $lower, @rest) = split ';', $line;
+	my $code = hex($hexcode);
+
+	if ($code != 32 and $category =~ m/^Z/) {
+	    $codemap->{$code}{chars} = [32]; # every other separator becomes SP
+	    next;
+	}
+
+	# Decomposition mapping: optional <type> tags, then hex code points
+	while ($decomposition ne '') {
+	    if ($decomposition =~ s/^<[^>]*>\s+//) {
+		# Ignore compat mappings to SP followed by combining char
+		$decomposition = '' if $decomposition =~ m/^0020 /;
+		next;
+	    }
+	    # anchored, and fatal otherwise, so bad input cannot spin forever
+	    $decomposition =~ s/^\s*([0-9a-fA-F]+)\s*//
+		or die "Invalid decomposition for $hexcode in $name\n";
+	    push @{$codemap->{$code}{chars}}, hex($1);
+	}
+
+	# Lower case equivalent mapping (replaces any decomposition found above)
+	$codemap->{$code}{chars} = [hex($lower)] if $lower;
+    }
+}
+
+# Perform the transitive closure on the unicode mapping table: keep replacing
+# mapped chars by their own mappings, then print the multi-char translations
+sub mungemap {
+    my ($codemap) = @_;
+
+    my $didchange;
+
+    # Rescan the whole table for as long as a pass still expanded something
+    do {
+	warn "mkchartable: expanding unicode mappings...\n";
+
+	$didchange = 0;
+
+	foreach my $code (sort { $a <=> $b } keys %$codemap) {
+	    my @expanded;
+
+	    # substitute every mapped char that has a translation of its own
+	    foreach my $char (@{$codemap->{$code}{chars}}) {
+		my $entry = $codemap->{$char};
+		unless ($entry) {
+		    push @expanded, $char;
+		    next;
+		}
+		$didchange = 1;
+		push @expanded, @{$entry->{chars}};
+	    }
+
+	    # strip all whitespace, but put back one if nothing left
+	    my @visible = grep { $_ != 32 } @expanded;
+	    if (@visible != @expanded) {
+		@expanded = @visible ? @visible : (32);
+	    }
+
+	    $codemap->{$code}{chars} = \@expanded;
+	}
+    } while ($didchange);
+
+    warn "mkchartable: building expansion table...\n";
+
+    print <<EOF;
+/* Table of translations */
+const int chartables_translation_multichar[] = {
+  0, /* the index of 0 is reserved to mean "no translation" */
+EOF
+
+    # entry 0 of the C array is the reserved one printed above, so the
+    # first real translation starts at offset 1
+    my $offset = 1;
+
+    foreach my $code (sort { $a <=> $b } keys %$codemap) {
+	my @seq = @{$codemap->{$code}{chars}};
+
+	# single chars need no table entry
+	next if @seq <= 1;
+
+	# add to the translation table
+	my $hexlist = join(", ", map { sprintf("0x%04x", $_) } @seq);
+	print "  $hexlist";
+	printf ", 0, /* Translation for %04x (offset %d) */\n", $code, $offset;
+
+	# update tracking: each entry occupies its chars plus the 0 terminator
+	$codemap->{$code}{trans} = $offset;
+	$offset += @seq + 1;
+    }
+
+    # close the C array
+    print <<EOF;
+};
+
+EOF
+}
+
+# output the tables used for canonising the unicode into search normal
+# form: two levels of block indexes, then one table of 256 translations
+# for every 8 bit block that contains a mapped char.
+sub printmap {
+    my ($codemap) = @_;
+
+    warn "mkchartable: building translation table...\n";
+
+    # note every (high byte, middle byte) block that holds a mapping
+    my @needblock;
+    for my $code (keys %$codemap) {
+	my ($hi, $mid) = (($code >> 16) & 0xff, ($code >> 8) & 0xff);
+	$needblock[$hi][$mid] = 1;
+    }
+
+    print <<EOF;
+/* The next two tables are used for doing translations from
+ * 24-bit unicode values to canonical form.  First look up the
+ * code >> 16 (highest order block) in the block16 table to
+ * find the index to the block8 table for that block.
+ * If the index is 255, there are no translations for that
+ * block, so return the same value.  Otherwise, repeat for
+ * code >> 8 (middle block) to get an index into the
+ * direct translation block.  Again, 255 means no translations
+ * for that block.  Finally the translation can be one of.
+ *
+ * 0: no output
+ * +ve char: return this single char
+ * -ve number: offset into the chartables_translation_multichar
+ *             table.  Read chars until 0 encountered.
+ */
+const unsigned char chartables_translation_block16[256] = {
+EOF
+
+    # number the 16 bit blocks in use; 255 marks a block without mappings
+    my $used16 = 0;
+    for my $hi (0 .. 255) {
+	my $index = $needblock[$hi] ? $used16++ : 255;
+	printf(" %3d,", $index);
+	print "\n" if $hi % 8 == 7;
+    }
+
+    print <<EOF;
+};
+
+const unsigned char chartables_translation_block8[$used16][256] = {
+EOF
+
+    # same numbering one level down, over the 8 bit blocks of each used block
+    my $used8 = 0;
+    for my $hi (0 .. 255) {
+	my $need8 = $needblock[$hi] or next;
+	print " { /* translation for 16 bit offset $hi */\n ";
+	for my $mid (0 .. 255) {
+	    my $index = $need8->[$mid] ? $used8++ : 255;
+	    printf(" %3d,", $index);
+	    print "\n " if $mid % 8 == 7;
+	}
+	print "},\n";
+    }
+
+    print <<EOF;
+};
+
+/* NOTE: Unlike other charset translation tables, the
+ * chartables_translation table is NOT used to directly parse
+ * a charset.  Instead, it's used to convert from a unicode
+ * character to the "canonical form", possibly multiple
+ * characters.
+ */
+const int chartables_translation[$used8][256] = {
+EOF
+
+    # one row per used 8 bit block, in the order they were numbered above
+    for my $hi (0 .. 255) {
+	my $need8 = $needblock[$hi] or next;
+	for my $mid (grep { $need8->[$_] } 0 .. 255) {
+	    print " { /* Mapping for unicode chars in block $hi $mid */\n ";
+	    my $base = ($hi << 16) + ($mid << 8);
+
+	    for my $low (0 .. 255) {
+		my $codepoint = $base + $low;
+		my $entry = $codemap->{$codepoint};
+
+		# unmapped: the char itself; multichar: minus the offset into
+		# the multichar table; otherwise the single replacement char
+		my $cell
+		    = !$entry         ? sprintf("0x%04x", $codepoint)
+		    : $entry->{trans} ? sprintf("- %4d", $entry->{trans})
+		    :                   sprintf("0x%04x", $entry->{chars}[0]);
+
+		print " $cell,";
+		print "\n " if $low % 8 == 7;
+	    }
+	    print "},\n";
+	}
+    }
+    print "};\n\n";
+}
+
+# read a charset table, building intermediate state tables
+# for multibyte sequences and named state tables for mode
+# switches
+#
+# Input lines are "<hex code> <action>", where the action is "?" (no
+# mapping), ":name" (switch to state "name") or a hex target value;
+# a line of just ":name" selects the state the following lines belong to.
+# Returns a hashref; its {tables} list holds one state per entry, and each
+# state's {chars}[$byte] is [target value, state number, comment].
+sub readcharfile {
+    my ($name) = @_;
+
+    my $charfile = IO::File->new($name, 'r') || die "Failed to read $name";
+
+    my %data = (
+	currstate => -1,
+	num => 0,
+	tables => [],
+	states => {},
+    );
+
+    # state selected by the most recent ":name" line
+    my $basestate;
+
+    while (my $line = $charfile->getline()) {
+	chomp $line;
+	my $comment = $line;
+	$line =~ s/^\s+//; # strip leading space
+	next if $line =~ m/^\#/; # comment
+	next if $line eq ''; # blank
+
+	if ($line =~ m/^:(\S+)/) {
+	    # New state
+	    $basestate = getstate(\%data, $1);
+	    next;
+	}
+
+	$basestate ||= getstate(\%data, "");
+
+	die "Invalid data line $line\n" unless $line =~ s/^([0-9a-fA-F]+)\s+//;
+
+	my $code = hex($1);
+
+	# Every line starts over from the base state.  Walking with a per-line
+	# variable means no exit from this iteration (such as the "?" case
+	# below) can leave a multibyte intermediate state selected for the
+	# lines that follow.
+	my $state = $basestate;
+
+	# each leading byte of a multibyte code steps into an automatically
+	# named intermediate state, created on first use
+	foreach my $nbytes (4, 3, 2) {
+	    my $shift = 8 * ($nbytes - 1);
+	    next unless $code >= (1 << $shift);
+	    my $byte = ($code >> $shift) & 0xff;
+	    my $newname = sprintf "%s_%02x", $state->{name} || 'state', $byte;
+	    my $newstate = getstate(\%data, $newname);
+	    $state->{chars}[$byte] =
+		[0, $newstate->{num}, "Auto multibyte state $nbytes bytes $newname"];
+	    $state = $newstate;
+	}
+
+	my $char = $code & 0xff;
+	die "Duplicate defs for $char in $state->{name}"
+	    if $state->{chars}[$char];
+
+	# nothing
+	if ($line =~ m/^\?/) {
+	    next;
+	}
+
+	# state switch
+	if ($line =~ m/^:(\S*)/) {
+	    my $targetstate = getstate(\%data, $1);
+	    $state->{chars}[$char] = [0, $targetstate->{num}, $comment];
+	}
+	else {
+	    # otherwise it's a regular char
+	    die "Invalid data line $line\n" unless $line =~ s/^([0-9a-fA-F]+)\s+//;
+	    my $target = hex($1);
+	    $state->{chars}[$char] = [$target, $basestate->{num}, $comment];
+	}
+    }
+
+    return \%data;
+}
+
+# helper function to create a new state within a charset
+# $data is the per-charset hash built by readcharfile; $name is the state
+# name, optionally carrying a trailing " <" marker.  Returns the state hash
+# for that name, creating it and giving it the next free number if needed.
+sub getstate {
+    my ($data, $name) = @_;
+
+    # Strip the marker before the lookup: states are stored under the bare
+    # name, so looking up the raw name would miss and create a duplicate.
+    my $isreturn = ($name =~ s/ \<$//);
+
+    if (exists $data->{states}{$name}) { # could be 0
+	return $data->{tables}[$data->{states}{$name}];
+    }
+
+    my $num = $data->{num}++;
+
+    my $state = $data->{tables}[$num] = {
+	name => $name,
+	num => $num,
+	next => $isreturn ? -1 : $num, # -1 only when the marker was given
+	codes => {},
+    };
+    $data->{states}{$name} = $num;
+
+    return $state;
+}
+
+# output the table used for charset->unicode translation: a C array
+# with one 256 entry table for every state of the charset
+sub printtable {
+    my ($data, $name) = @_;
+
+    # reduce the file name to the part that is usable in a C identifier
+    for ($name) {
+	s{.*[\\/]}{}; # strip anything up to the last separator;
+	s{\..*}{}; # after a dot
+	s{-}{_}g; # underscores
+    }
+
+    my $num = $data->{num};
+    print "const struct charmap chartables_$name\[$num][256] = {\n";
+
+    foreach my $table (@{ $data->{tables} }) {
+	my $row = $table->{chars} || [];
+	my $label = $table->{name} ? " /* $table->{name} */" : "";
+	print " {" . $label . "\n";
+
+	foreach my $i (0..255) {
+	    my $char = $row->[$i];
+	    my $entry = $char
+		? "   { $char->[0], $char->[1] }, /* $char->[2] */\n"
+		: "   { 0, 0 }, /* no entry */\n";
+	    print $entry;
+	}
+
+	print " },\n";
+    }
+
+    print "};\n\n";
+}
+
+# print the header of the chartable.c file: a comment naming every input
+# file with its sha1, then the chartable.h include
+sub printheader {
+    my ($maps, $charsets) = @_;
+
+    print <<EOF;
+/* This file is generated by mkchartable.pl with the following arguments
+ *
+EOF
+    # one comment line per input file: unicode maps first, then charsets
+    for my $mapfile (@$maps) {
+	print " * map:     " . getsha1($mapfile) . " $mapfile\n";
+    }
+    for my $tablefile (@$charsets) {
+	print " * charset: " . getsha1($tablefile) . " $tablefile\n";
+    }
+    print <<EOF;
+ */
+
+#include "chartable.h"
+
+EOF
+}
+
+# print the lookup table for character sets at the end of the chartable.c
+# file: the charset name to table array, then the count of its entries.
+sub printlookup {
+    print <<EOF;
+
+/*
+ * Mapping of character sets to tables
+ */
+
+const struct charset chartables_charset_table[] = {
+    { "us-ascii", chartables_us_ascii },	/* US-ASCII must be charset number 0 */
+    { "utf-8", 0 }, /* handled directly */
+    { "utf-7", 0 }, /* handled directly */
+    { "iso-8859-1", chartables_iso_8859_1 },
+    { "iso-8859-2", chartables_iso_8859_2 },
+    { "iso-8859-3", chartables_iso_8859_3 },
+    { "iso-8859-4", chartables_iso_8859_4 },
+    { "iso-8859-5", chartables_iso_8859_5 },
+    { "iso-8859-6", chartables_iso_8859_6 },
+    { "iso-8859-7", chartables_iso_8859_7 },
+    { "iso-8859-8", chartables_iso_8859_8 },
+    { "iso-8859-9", chartables_iso_8859_9 },
+    { "koi8-r", chartables_koi8_r },
+    { "iso-2022-jp", chartables_iso_2022_jp },
+    { "iso-2022-kr", chartables_iso_2022_kr },
+    { "gb2312", chartables_gb2312 },
+    { "big5", chartables_big5 },
+    /* Compatibility names */
+    { "unicode-1-1-utf-7", 0 }, /* handled directly */
+    { "unicode-2-0-utf-7", 0 }, /* handled directly */
+    { "x-unicode-2-0-utf-7", 0 }, /* handled directly */
+    /* End Compatibility Names */
+    { "iso-8859-15", chartables_iso_8859_15 },
+    { "windows-1252", chartables_windows_1252 },
+    { "windows-1256", chartables_windows_1256 },
+    { "windows-1250", chartables_windows_1250 },
+    { "windows-1251", chartables_windows_1251 },
+    { "windows-1255", chartables_windows_1255 },
+    /* New character sets should only be added to end so that
+     * cache files stay with valid information */
+};
+
+const int chartables_num_charsets = (sizeof(chartables_charset_table)/sizeof(*chartables_charset_table));
+EOF
+}
+
+# calculate the sha1 of a file: its hex digest, or "<none>" if no open
+sub getsha1 {
+    my $file = shift;
+    my $fh = IO::File->new($file, 'r') || return "<none>";
+    binmode($fh); # digest the raw bytes, not a text-mode translation of them
+    my $digest = Digest::SHA1->new();
+    return $digest->addfile($fh)->hexdigest(); # addfile returns the object
+}
+
+__END__
-- 
1.5.6.5

