From c167a99d5a382c4f2333ab29002af9b4d6722bb2 Mon Sep 17 00:00:00 2001 From: Bron Gondwana Date: Mon, 23 Mar 2009 13:48:05 +1100 Subject: [PATCH] Complete rewrite of charset handling, using Perl --- imap/imapd.c | 72 +-- lib/Makefile.in | 11 +- lib/charset.c | 1904 +++++++++++++++++++++------------------------ lib/charset.h | 29 +- lib/charset/iso-2022-jp.t | 33 +- lib/charset/iso-2022-kr.t | 12 +- lib/chartable.h | 27 +- lib/mkchartable.c | 975 ----------------------- lib/mkchartable.pl | 531 +++++++++++++ 9 files changed, 1487 insertions(+), 2107 deletions(-) delete mode 100644 lib/mkchartable.c create mode 100644 lib/mkchartable.pl diff --git a/imap/imapd.c b/imap/imapd.c index 8acb36e..074d1ed 100644 --- a/imap/imapd.c +++ b/imap/imapd.c @@ -7608,26 +7608,16 @@ int parsecharset; c = getastring(imapd_in, imapd_out, &arg); if (c == EOF) goto missingarg; str = charset_convert(arg.s, *charset, NULL, 0); - if (strchr(str, EMPTY)) { - /* Force failure */ - searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); - } - else { - appendstrlistpat(&searchargs->bcc, str); - } + if (str) appendstrlistpat(&searchargs->bcc, str); + else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); } else if (!strcmp(criteria.s, "body")) { if (c != ' ') goto missingarg; c = getastring(imapd_in, imapd_out, &arg); if (c == EOF) goto missingarg; str = charset_convert(arg.s, *charset, NULL, 0); - if (strchr(str, EMPTY)) { - /* Force failure */ - searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); - } - else { - appendstrlistpat(&searchargs->body, str); - } + if (str) appendstrlistpat(&searchargs->body, str); + else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); } else goto badcri; break; @@ -7638,13 +7628,8 @@ int parsecharset; c = getastring(imapd_in, imapd_out, &arg); if (c == EOF) goto missingarg; str = charset_convert(arg.s, *charset, NULL, 0); - if (strchr(str, EMPTY)) { - /* Force failure */ - searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); - } - else { - appendstrlistpat(&searchargs->cc, str); - } + if (str) appendstrlistpat(&searchargs->cc, str); + else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); } else if (parsecharset && !strcmp(criteria.s, "charset")) { if (c != ' ') goto missingarg; @@ -7675,13 +7660,8 @@ int parsecharset; c = getastring(imapd_in, imapd_out, &arg); if (c == EOF) goto missingarg; str = charset_convert(arg.s, *charset, NULL, 0); - if (strchr(str, EMPTY)) { - /* Force failure */ - searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); - } - else { - appendstrlistpat(&searchargs->from, str); - } + if (str) appendstrlistpat(&searchargs->from, str); + else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); } else goto badcri; break; @@ -7732,13 +7712,8 @@ int parsecharset; c = getastring(imapd_in, imapd_out, &arg); if (c == EOF) goto missingarg; str = charset_convert(arg.s, *charset, NULL, 0); - if (strchr(str, EMPTY)) { - /* Force failure */ - searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); - } - else { - appendstrlistpat(patlist, str); - } + if (str) appendstrlistpat(patlist, str); + else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); } else goto badcri; break; @@ -7909,13 +7884,8 @@ int parsecharset; c = getastring(imapd_in, imapd_out, &arg); if (c == EOF) goto missingarg; str = charset_convert(arg.s, *charset, NULL, 0); - if (strchr(str, EMPTY)) { - /* Force failure */ - searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); - } - else { - appendstrlistpat(&searchargs->subject, str); - } + if (str) appendstrlistpat(&searchargs->subject, str); + else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); } else goto badcri; break; @@ -7926,26 +7896,16 @@ int parsecharset; c = getastring(imapd_in, imapd_out, &arg); if (c == EOF) goto missingarg; str = charset_convert(arg.s, *charset, NULL, 0); - if (strchr(str, EMPTY)) { - /* Force failure */ - searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); - } - else { - appendstrlistpat(&searchargs->to, str); - } + if (str) appendstrlistpat(&searchargs->to, str); + else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); } else if (!strcmp(criteria.s, "text")) { if (c != ' ') goto missingarg; c = getastring(imapd_in, imapd_out, &arg); if (c == EOF) goto missingarg; str = charset_convert(arg.s, *charset, NULL, 0); - if (strchr(str, EMPTY)) { - /* Force failure */ - searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); - } - else { - appendstrlistpat(&searchargs->text, str); - } + if (str) appendstrlistpat(&searchargs->text, str); + else searchargs->flags = (SEARCH_RECENT_SET|SEARCH_RECENT_UNSET); } else goto badcri; break; diff --git a/lib/Makefile.in b/lib/Makefile.in index 07f8e59..d5adb77 100644 --- a/lib/Makefile.in +++ b/lib/Makefile.in @@ -141,24 +141,19 @@ imapopts.c: imapoptions $(srcdir)/../tools/config2header imapopts.h: imapopts.c -chartable.c: mkchartable +chartable.c: mkchartable.pl @echo "### Building chartables..." rm -f chartable.c - ./mkchartable \ + perl ./mkchartable.pl \ -m $(srcdir)/charset/unifix.txt \ -m $(srcdir)/charset/unidata2.txt \ $(srcdir)/charset/*.t \ > chartable.c \ || (rm -f chartable.c && exit 1) @echo "### Done building chartables." -# ./mkchartable -m $(srcdir)/charset/unicode.map $(srcdir)/charset/*.t >x-chartables.h -# mv x-chartables.h chartables.h - -mkchartable: mkchartable.o xstrlcpy.o xstrlcat.o xmalloc.o assert.o - $(CC) $(LDFLAGS) -o mkchartable mkchartable.o xstrlcpy.o xstrlcat.o xmalloc.o assert.o clean: - rm -f *.o *.a chartable.c Makefile.bak mkchartable makedepend.log \ + rm -f *.o *.a chartable.c Makefile.bak makedepend.log \ $(BUILTSOURCES) distclean: clean diff --git a/lib/charset.c b/lib/charset.c index a83a625..65fda3d 100644 --- a/lib/charset.c +++ b/lib/charset.c @@ -53,115 +53,72 @@ #include "chartable.h" #include "util.h" -extern const unsigned char chartables_long_translations[]; -extern const int charset_max_translation; -extern const unsigned char chartables_unicode_block[256]; -extern const unsigned char chartables_unicode[][256][4]; -extern const unsigned char chartables_us_ascii[][256][4]; +/* unicode canon translations */ +extern const int chartables_translation_multichar[]; +extern const unsigned char chartables_translation_block16[256]; +extern const unsigned char chartables_translation_block8[][256]; +extern const int chartables_translation[][256]; + +/* named character sets */ extern const struct charset chartables_charset_table[]; extern const int chartables_num_charsets; -struct decode_state { - const unsigned char (*curtable)[256][4]; - const unsigned char (*lasttable)[256][4]; - const unsigned char (*initialtable)[256][4]; - unsigned utfcode; - unsigned num_bits; - unsigned b64_value; +struct qp_state { + int isheader; + int bytesleft; + int codepoint; +}; + +struct b64_state { + int bytesleft; + int codepoint; +}; + +struct table_state { + const struct charmap (*curtable)[256]; + const struct charmap (*initialtable)[256]; + int bytesleft; + int codepoint; + int mode; + int num_bits; +}; + +struct canon_state { + int spacemode; + int seenspace; }; -#define START(state,table) \ - ((state).curtable = (state.initialtable) = (table)); \ - ((state).lasttable = NULL); \ - ((state).utfcode = 0); \ - ((state).num_bits = 0); \ - ((state).b64_value = 0); - - -static int xlate(int index, char *to); -static int writeutf8(unsigned utfcode, char *to); - -#define TRANSLATE(state,c,ptr,idx) \ -{ \ - unsigned char _ch; \ - const unsigned char *_translation = (state).curtable[0][(unsigned char)(c) & 0xff]; \ - for (;;) { \ - switch (_ch = *_translation++) { \ - case JSR: \ - (state).lasttable = (state).curtable; \ - /* FALL THROUGH */ \ - case JMP: \ - (state).curtable = ((state).initialtable + \ - (_translation[0]<<8) + (_translation[1])); \ - break; \ - \ - case RET: \ - (state).curtable = (state).lasttable; \ - /* FALL THROUGH */ \ - case END: \ - break; \ -\ - case U7F: \ - (state).b64_value = 0; \ - (state).num_bits = 0; \ - (state).curtable = ((state).initialtable + 1); \ - /* FALL THROUGH */ \ - case U7N: \ - (state).b64_value <<= 6; \ - (state).b64_value += index_64[(unsigned char)(c) & 0xff]; \ - (state).num_bits += 6; \ - if ((state).num_bits >= 16) { \ - (state).num_bits -= 16; \ - (state).utfcode = \ - ((state).b64_value >> (state).num_bits) & 0x7fff; \ - idx += writeutf8((state).utfcode, ptr+idx); \ - } \ - break; \ -\ - case U83: \ - (state).lasttable = (state).curtable; \ - (state).utfcode = (c & 0x0f) << 12; \ - (state).curtable = ((state).initialtable + 1); \ - break; \ -\ - case U83_2: \ - (state).utfcode += (c & 0x3f) << 6; \ - (state).curtable = ((state).initialtable + 2); \ - break; \ -\ - case U83_3: \ - (state).utfcode += (c & 0x03f); \ - (state).curtable = (state).initialtable; \ - idx += writeutf8((state).utfcode, ptr+idx); \ - break; \ - \ - case XLT: \ - idx += xlate((_translation[0]<<8) + (_translation[1]), ptr+idx); \ - _translation += 2; /* next translation is a RET or END */ \ - continue; \ - \ - default: \ - (ptr)[(idx)++] = _ch; \ - continue; \ - } \ - break; \ - } \ -} -/* for a comp_pat, ascii[0x80] == 0 if there are any non-ascii characters - in the pattern */ struct comp_pat_s { - int pat[256]; /* boyer-moore skip table */ - int ascii[256]; /* case-mapped version of table */ - int patlen; - int patlastchar; /* last character in the pattern */ - int patotherlastchar; /* case-flip of the last character */ + int max_start; + size_t patlen; }; -#define PATASCII(pat) (pat+256) -#define PATLEN(pat) ((pat)[512]) -#define PATLASTCHAR(pat) ((pat)[513]) /* last character in the pattern */ -#define PATOTHERLASTCHAR(pat) ((pat)[514]) /* case-flip of the pattern */ -#define PATSIZE 515 +struct search_state { + size_t *starts; + int max_start; + int havematch; + unsigned char *substr; + size_t patlen; + size_t offset; +}; + +struct buffer_state { + unsigned char *base; + size_t offset; + size_t alloced; +}; + +struct convert_rock; + +typedef void convertproc_t(struct convert_rock *rock, int c); +typedef void freeconvert_t(struct convert_rock *rock); + +struct convert_rock { + convertproc_t *f; + freeconvert_t *cleanup; + struct convert_rock *next; + void *state; +}; #define GROWSIZE 100 @@ -212,7 +169,557 @@ static const char index_64[256] = { }; #define CHAR64(c) (index_64[(unsigned char)(c)]) -#define USASCII(c) (chartables_us_ascii[0][(unsigned char)(c)][0]) +static inline void convert_putc(struct convert_rock *rock, int c) +{ + rock->f(rock, c); +} + +void convert_cat(struct convert_rock *rock, const char *s) +{ + while (*s) { + convert_putc(rock, (unsigned char)*s); + s++; + } +} + +void convert_catn(struct convert_rock *rock, const char *s, size_t len) +{ + while (len-- > 0) { + convert_putc(rock, (unsigned char)*s); + s++; + } +} + +/* convertproc_t conversion functions */ + +void qp2byte(struct convert_rock *rock, int c) +{ + struct qp_state *s = (struct qp_state *)rock->state; + int val; + + if (s->bytesleft) { + s->bytesleft--; + val = HEXCHAR(c); + if (val == XX) { + /* mark invalid regardless */ + s->codepoint = -1; + return; + } + if (s->codepoint != -1) { + /* don't blat the invalid marker, but still absorb + * the second char */ + s->codepoint = (s->codepoint << 4) + val; + } + if (!s->bytesleft) { + convert_putc(rock->next, s->codepoint & 0xff); + } + return; + } + + /* start an encoded byte */ + if (c == '=') { + s->bytesleft = 2; + s->codepoint = 0; + return; + } + + /* underscores are space in headers */ + if (s->isheader && c == '_') c = ' '; + + convert_putc(rock->next, c); +} + +void b64_2byte(struct convert_rock *rock, int c) +{ + struct b64_state *s = (struct b64_state *)rock->state; + char b = CHAR64(c); + + /* could just be whitespace, ignore it */ + if (b == XX) return; + + switch (s->bytesleft) { + case 0: + s->codepoint = b; + s->bytesleft = 3; + break; + case 3: + convert_putc(rock->next, ((s->codepoint << 2) | (b >> 4)) & 0xff); + s->codepoint = b; + s->bytesleft = 2; + break; + case 2: + convert_putc(rock->next, ((s->codepoint << 4) | (b >> 2)) & 0xff); + s->codepoint = b; + s->bytesleft = 1; + break; + case 1: + convert_putc(rock->next, ((s->codepoint << 6) | b) & 0xff); + s->codepoint = 0; + s->bytesleft = 0; + } +} + +void table2uni(struct convert_rock *rock, int c) +{ + struct table_state *s = (struct table_state *)rock->state; + struct charmap *map = (struct charmap *)&s->curtable[0][c & 0xff]; + + if (c == -1) { /* invalid character propogation */ + convert_putc(rock->next, 0xfffd); + return; + } + + if (map->c) { + convert_putc(rock->next, map->c); + } + + s->curtable = s->initialtable + map->next; +} + +void utf8_2uni(struct convert_rock *rock, int c) +{ + struct table_state *s = (struct table_state *)rock->state; + + if (c == -1) { /* invalid character propogation */ + convert_putc(rock->next, 0xfffd); + return; + } + + if ((c & 0xf8) == 0xf0) { /* 11110xxx */ + /* first of a 4 char sequence */ + s->bytesleft = 3; + s->codepoint = c & 0x07; /* 00000111 */ + } + else if ((c & 0xf0) == 0xe0) { /* 1110xxxx */ + /* first of a 3 char sequence */ + s->bytesleft = 2; + s->codepoint = c & 0x0f; /* 00001111 */ + } + else if ((c & 0xe0) == 0xc0) { /* 110xxxxx */ + /* first of a 2 char sequence */ + s->bytesleft = 1; + s->codepoint = c & 0x1f; /* 00011111 */ + } + else if ((c & 0xc0) == 0x80) { /* 10xxxxxx */ + /* continuation char, handle only if expected */ + if (s->bytesleft > 0) { + s->codepoint = (s->codepoint << 6) + (c & 0x3f); /* 00111111 */ + s->bytesleft--; + if (!s->bytesleft) { + convert_putc(rock->next, s->codepoint); + s->codepoint = 0; + } + } + } + else { /* plain ASCII char */ + convert_putc(rock->next, c); + s->bytesleft = 0; + s->codepoint = 0; + } +} + +void utf7_2uni (struct convert_rock *rock, int c) +{ + struct table_state *s = (struct table_state *)rock->state; + + if (c == -1) { /* invalid character propogation */ + convert_putc(rock->next, 0xfffd); + return; + } + + if (c & 0x80) { /* skip 8-bit chars */ + convert_putc(rock->next, -1); + return; + } + + /* Inside a base64 encoded unicode fragment */ + if (s->mode) { + /* '-' marks the end of a fragment */ + if (c == '-') { + /* special case: sequence +- creates output '+' */ + if (s->mode == 1) + convert_putc(rock->next, '+'); + /* otherwise no output for the '-' */ + s->mode = 0; + s->num_bits = 0; + s->codepoint = 0; + } + /* a normal char drops us out of base64 mode */ + else if (CHAR64(c) == XX) { + /* pass on the char */ + convert_putc(rock->next, c); + /* and switch back to ASCII mode */ + s->mode = 0; + /* XXX: warn if num_bits > 4 or codepoint != 0 */ + s->num_bits = 0; + s->codepoint = 0; + } + /* base64 char - process it into the state machine */ + else { + s->mode = 2; /* we have some content, so don't process special +- */ + /* add the 6 bits of value from this character */ + s->codepoint = (s->codepoint << 6) + CHAR64(c); + s->num_bits += 6; + /* if we've got a full character's worth of bits, send it down + * the line and keep the remainder for the next character */ + if (s->num_bits >= 16) { + s->num_bits -= 16; + convert_putc(rock->next, (s->codepoint >> s->num_bits) & 0x7fff); + s->codepoint &= ((1 << s->num_bits) - 1); /* avoid overflow by trimming */ + } + } + } + + /* regular ASCII mode */ + else { + /* '+' switches to base64 unicode mode */ + if (c == '+') { + s->mode = 1; /* switch mode, but no content processed yet */ + s->codepoint = 0; + s->num_bits = 0; + } + /* regular ASCII char */ + else { + convert_putc(rock->next, c); + } + } +} + +void uni2searchform(struct convert_rock *rock, int c) +{ + struct canon_state *s = (struct canon_state *)rock->state; + int i; + int code; + unsigned char table16, table8; + + table16 = chartables_translation_block16[(c>>16) & 0xff]; + + /* no translations */ + if (table16 == 255) { + convert_putc(rock->next, c); + return; + } + + table8 = chartables_translation_block8[table16][(c>>8) & 0xff]; + + /* no translations */ + if (table8 == 255) { + convert_putc(rock->next, c); + return; + } + + /* use the xlate table */ + code = chartables_translation[table8][c & 0xff]; + + /* case - zero length output */ + if (code == 0) { + return; + } + + /* special case: whitespace */ + if (code == ' ') { + switch (s->spacemode) { + case 0: + return; + + case 1: + if (s->seenspace) + return; + s->seenspace = 1; + break; + /* XXX - anything other than compress or strip? */ + } + } + else + s->seenspace = 0; + + /* case - one character output */ + if (code > 0) { + convert_putc(rock->next, code); + return; + } + + /* case - multiple characters */ + for (i = -code; chartables_translation_multichar[i]; i++) { + /* note: whitespace already stripped from multichar sequences... */ + convert_putc(rock->next, chartables_translation_multichar[i]); + } +} + +void uni2utf8(struct convert_rock *rock, int c) +{ + if (c > 0xffff) { + convert_putc(rock->next, 0xF0 + ((c >> 18) & 0x07)); + convert_putc(rock->next, 0x80 + ((c >> 12) & 0x3f)); + convert_putc(rock->next, 0x80 + ((c >> 6) & 0x3f)); + convert_putc(rock->next, 0x80 + ( c & 0x3f)); + } + else if (c > 0x7ff) { + convert_putc(rock->next, 0xE0 + ((c >> 12) & 0x0f)); + convert_putc(rock->next, 0x80 + ((c >> 6) & 0x3f)); + convert_putc(rock->next, 0x80 + ( c & 0x3f)); + } + else if (c > 0x7f) { + convert_putc(rock->next, 0xC0 + ((c >> 6) & 0x1f)); + convert_putc(rock->next, 0x80 + ( c & 0x3f)); + } + else { + convert_putc(rock->next, c); + } +} + +void byte2search(struct convert_rock *rock, int c) +{ + struct search_state *s = (struct search_state *)rock->state; + int i, cur; + unsigned char b = (unsigned char)c; + + /* check our "in_progress" matches to see if they're still valid */ + for (i = 0, cur = 0; i < s->max_start; i++) { + /* no more active offsets */ + if (s->starts[i] == -1) + break; + + /* if we've passed one that's not ongoing, copy back */ + if (cur < i) { + s->starts[cur] = s->starts[i]; + } + /* check that the substring is still maching */ + if (b == s->substr[s->offset - s->starts[i]]) { + if (s->offset - s->starts[i] == s->patlen - 1) { + /* we're there! */ + s->havematch = 1; + } + else { + /* keep this one, it's ongoing */ + cur++; + } + } + } + /* starting a new one! */ + if (b == s->substr[0]) { + /* have to treat this one specially! */ + if (s->patlen == 1) + s->havematch = 1; + else + s->starts[cur++] = s->offset; + } + /* empty out any others that aren't being kept */ + while (cur < i) s->starts[cur++] = -1; + + /* increment the offset counter */ + s->offset++; +} + +void byte2buffer(struct convert_rock *rock, int c) +{ + struct buffer_state *buf = (struct buffer_state *)rock->state; + + /* make sure we have the space */ + if (buf->offset >= buf->alloced) { + buf->alloced += GROWSIZE; + buf->base = realloc(buf->base, buf->alloced); + } + + buf->base[buf->offset++] = c & 0xff; +} + +/* convert_rock manipulation routines */ + +void table_switch(struct convert_rock *rock, int charset_num) +{ + struct table_state *state = (struct table_state *)rock->state; + + /* wipe any current state */ + memset(state, 0, sizeof(struct table_state)); + + /* it's a table based lookup */ + if (chartables_charset_table[charset_num].table) { + /* set up the initial table */ + state->curtable = state->initialtable + = chartables_charset_table[charset_num].table; + rock->f = table2uni; + } + + /* special case UTF-8 */ + else if (strstr(chartables_charset_table[charset_num].name, "utf-8")) { + rock->f = utf8_2uni; + } + + /* special case UTF-7 */ + else if (strstr(chartables_charset_table[charset_num].name, "utf-7")) { + rock->f = utf7_2uni; + } + + /* should never happen */ + else { + exit(1); + /* do something fatal here! */ + } +} + +/* Extract a cstring from a buffer. NOTE: caller must free the memory + * themselves once this is called. Resets the state. If you don't + * call this function then buffer_free will clean up */ +char *buffer_cstring(struct convert_rock *rock) +{ + struct buffer_state *buf = (struct buffer_state *)rock->state; + char *res; + + /* finish the string */ + if (buf->offset >= buf->alloced) { + buf->alloced++; + buf->base = realloc(buf->base, buf->alloced); + } + buf->base[buf->offset] = '\0'; + + /* copy the pointer out */ + res = buf->base; + + /* clean up the buffer so it frees correctly later */ + buf->base = 0; + buf->alloced = 0; + buf->offset = 0; + + return res; +} + +static inline int search_havematch(struct convert_rock *rock) +{ + struct search_state *s = (struct search_state *)rock->state; + return s->havematch; +} + +/* conversion cleanup routines */ + +void basic_free(struct convert_rock *rock) +{ + if (rock) { + if (rock->state) free(rock->state); + free(rock); + } +} + +void search_free(struct convert_rock *rock) +{ + if (rock && rock->state) { + struct search_state *s = (struct search_state *)rock->state; + if (s->starts) free(s->starts); + } + basic_free(rock); +} + +void buffer_free(struct convert_rock *rock) { + if (rock && rock->state) { + struct buffer_state *buf = (struct buffer_state *)rock->state; + if (buf->base) free(buf->base); + } + basic_free(rock); +} + +void convert_free(struct convert_rock *rock) { + struct convert_rock *next; + while (rock) { + next = rock->next; + if (rock->cleanup) + rock->cleanup(rock); + else + basic_free(rock); + rock = next; + } +} + +/* converter initialisation routines */ + +struct convert_rock *qp_init(int isheader, struct convert_rock *next) +{ + struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock)); + struct qp_state *s = xzmalloc(sizeof(struct qp_state)); + s->isheader = isheader; + rock->state = (void *)s; + rock->f = qp2byte; + rock->next = next; + return rock; +} + +struct convert_rock *b64_init(struct convert_rock *next) +{ + struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock)); + rock->state = xzmalloc(sizeof(struct b64_state)); + rock->f = b64_2byte; + rock->next = next; + return rock; +} + +struct convert_rock *canon_init(int spacemode, struct convert_rock *next) +{ + struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock)); + struct canon_state *s = xzmalloc(sizeof(struct canon_state)); + s->spacemode = spacemode; + rock->f = uni2searchform; + rock->state = s; + rock->next = next; + return rock; +} + +struct convert_rock *uni_init(struct convert_rock *next) +{ + struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock)); + rock->f = uni2utf8; + rock->next = next; + return rock; +} + +struct convert_rock *table_init(int charset_num, struct convert_rock *next) +{ + struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock)); + rock->state = xzmalloc(sizeof(struct table_state)); + rock->next = next; + table_switch(rock, charset_num); + return rock; +} + +struct convert_rock *search_init(const char *substr, comp_pat *pat) { + struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock)); + struct search_state *s = xzmalloc(sizeof(struct search_state)); + struct comp_pat_s *p = (struct comp_pat_s *)pat; + int i; + + /* copy in tracking vars */ + s->max_start = p->max_start; + s->patlen = p->patlen; + s->substr = (unsigned char *)substr; + + /* allocate tracking space and initialise to "no match" */ + s->starts = xmalloc(s->max_start * sizeof(int)); + for (i = 0; i < s->max_start; i++) { + s->starts[i] = -1; + } + + /* set up the rock */ + rock->f = byte2search; + rock->cleanup = search_free; + rock->state = (void *)s; + + return rock; +} + +struct convert_rock *buffer_init(char *str, int len) +{ + struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock)); + struct buffer_state *buf = xzmalloc(sizeof(struct buffer_state)); + + buf->base = str; + buf->alloced = len; + + rock->f = byte2buffer; + rock->cleanup = buffer_free; + rock->state = (void *)buf; + + return rock; +} + +/* API */ /* * Lookup the character set 'name'. Returns the character set number @@ -222,9 +729,11 @@ int charset_lookupname(const char *name) { int i; - for (i=0; i= chartables_num_charsets) return xstrdup(EMPTY_STRING); + if (charset < 0 || charset >= chartables_num_charsets) + return 0; - START(state,chartables_charset_table[charset].table); - - if (!alloced) { - alloced = GROWSIZE; - retval = xmalloc(alloced); - } - *retval = '\0'; + /* set up the conversion path */ + tobuffer = buffer_init(buf, bufsz); + input = uni_init(tobuffer); + input = canon_init(1, input); + input = table_init(charset, input); - while (*s) { - if (pos + charset_max_translation >= alloced) { - alloced += GROWSIZE; - retval = xrealloc(retval, alloced); - } - TRANSLATE(state, *s, retval, pos); - s++; + /* do the conversion */ + convert_cat(input, s); + + /* extract the result */ + res = buffer_cstring(tobuffer); + + /* clean up */ + convert_free(input); + + return res; +} + +/* Convert from a given charset and encoding into utf8 */ +char *charset_to_utf8(const char *msg_base, size_t len, int charset, int encoding) +{ + struct convert_rock *input, *tobuffer; + char *res; + + /* Initialize character set mapping */ + if (charset < 0 || charset >= chartables_num_charsets) + return 0; + + /* check for trivial search */ + if (len == 0) + return xstrdup(""); + + /* set up the conversion path */ + tobuffer = buffer_init(0, 0); + input = uni_init(tobuffer); + input = table_init(charset, input); + + /* choose encoding extraction if needed */ + switch (encoding) { + case ENCODING_NONE: + break; + + case ENCODING_QP: + input = qp_init(0, input); + break; + + case ENCODING_BASE64: + input = b64_init(input); + /* XXX have to have nl-mapping base64 in order to + * properly count \n as 2 raw characters + */ + break; + + default: + /* Don't know encoding--nothing can match */ + convert_free(input); + return 0; } - retval[pos] = '\0'; - return retval; + convert_catn(input, msg_base, len); + res = buffer_cstring(tobuffer); + convert_free(input); + + return res; } -/* - * Decode MIME strings (per RFC 2047) in 's'. It writes the decoded - * string to 'retval', calling realloc() as needed. (Thus retval may - * be NULL.) Returns retval, contining 's' in canonical searching form. - */ -char *charset_decode_mimeheader(const char *s, char *retval, int alloced) +void mimeheader_cat(struct convert_rock *target, const char *s) { + struct convert_rock *input; int eatspace = 0; const char *start, *endcharset, *encoding, *end; const char *p; - int i, c, c1, c2, c3, c4; - struct decode_state state; + int i, c, c1, c2, c3, c4, charset; int pos = 0; int len; + char *res; if (!s) return 0; - START(state,chartables_charset_table[0].table); /* just for msvc lint */ + /* set up the conversion path */ + input = table_init(0, target); + start = s; while ((start = (const char*) strchr(start, '=')) != 0) { start++; @@ -308,88 +860,40 @@ char *charset_decode_mimeheader(const char *s, char *retval, int alloced) } if (!eatspace) { len = start - s - 1; - if (pos + len >= alloced) { - alloced += len + GROWSIZE; - retval = xrealloc(retval, alloced); - } - while (len--) { - c = USASCII(*s); - if (c != END) retval[pos++] = (char)c; - s++; - } + table_switch(input, 0); /* US_ASCII */ + convert_catn(input, s, len); } /* * Get the 1522-word's character set */ start++; - for (i=0; i= alloced) { - alloced += 2 + GROWSIZE; - retval = xrealloc(retval, alloced); - } - strcpy(retval+pos, EMPTY_STRING); - pos += 1; - } - else if (encoding[1] == 'q' || encoding[1] == 'Q') { - /* Decode 'Q' encoding */ - p = encoding+3; - while (p < end) { - c = *p++; - if (c == '=') { - c = HEXCHAR(*p); - p++; - i = HEXCHAR(*p); - p++; - if (c == XX || i == XX) { - c = '\0'; - } - else { - c = (char)((c << 4) + i); - } - } - else if (c == '_') c = ' '; - - if (pos + charset_max_translation >= alloced) { - alloced += GROWSIZE; - retval = xrealloc(retval, alloced); - } - TRANSLATE(state, c, retval, pos); - } + convert_putc(input, -1); /* unknown character */ } else { - /* Decode 'B' encoding */ - p = encoding+3; - while (p < end) { - if (pos + charset_max_translation*3 >= alloced) { - alloced += GROWSIZE; - retval = xrealloc(retval, alloced); - } - c1 = CHAR64(p[0]); - if (c1 == XX) break; - c2 = CHAR64(p[1]); - if (c2 == XX) break; - TRANSLATE(state,((c1<<2) | ((c2&0x30)>>4)), retval, pos); - - c3 = CHAR64(p[2]); - if (c3 == XX) break; - TRANSLATE(state,(((c2&0XF)<<4) | ((c3&0x3C)>>2)), retval, pos); - - c4 = CHAR64(p[3]); - if (c4 == XX) break; - TRANSLATE(state,(((c3&0x03) <<6) | c4), retval, pos); - - p += 4; + struct convert_rock *extract; + /* choose decoder */ + if (encoding[1] == 'q' || encoding[1] == 'Q') { + extract = qp_init(1, input); } + else { + extract = b64_init(input); + } + /* convert */ + p = encoding+3; + convert_catn(extract, p, end - p); + /* clean up */ + basic_free(extract); } /* Prepare for the next iteration */ @@ -398,164 +902,136 @@ char *charset_decode_mimeheader(const char *s, char *retval, int alloced) } /* Copy over the tail part of the input string */ - len = strlen(s); - if (pos + len >= alloced) { - alloced += len + 1; - retval = xrealloc(retval, alloced); - } - while (len--) { - c = USASCII(*s); - if (c != END) retval[pos++] = (char)c; - s++; + if (*s) { + table_switch(input, 0); /* US_ASCII */ + convert_cat(input, s); } - retval[pos] = '\0'; - return retval; -} -/* - * Compile the pattern 's' and return a pointer to the compiled form - */ -comp_pat *charset_compilepat(const char *s) -{ - comp_pat *pat; - int i, c, len; - - pat = (comp_pat *)xmalloc(PATSIZE * sizeof(comp_pat)); - PATLEN(pat) = len = strlen(s); - if (len) { - PATLASTCHAR(pat) = c = (unsigned char)s[len-1]; - if (isupper(c)) PATOTHERLASTCHAR(pat) = TOLOWER(c); - else if (islower(c)) PATOTHERLASTCHAR(pat) = TOUPPER(c); - else PATOTHERLASTCHAR(pat) = c; - } - for (i=0; i<512; i++) pat[i] = len; - for (i=0; i= 0 && s[i] == substr[j]) { - i--; - j--; - } - if (j < 0) return 1; /* Found match */ - if (pat[(unsigned char)s[i]] == large || - pat[(unsigned char)s[i]] < PATLEN(pat)-j) { - i += PATLEN(pat) - j; - } - else { - i += pat[(unsigned char)s[i]]; - } - } -} + mimeheader_cat(input, s); + + res = buffer_cstring(tobuffer); -static int xlate(int index, char *to) { - const unsigned char *from = chartables_long_translations + index; - int i = 0; + convert_free(input); - while ((*to++ = *from++) != END) i++; - return i; + return res; } -static int writeutf8(unsigned utfcode, char *to) +int charset_search_mimeheader(const char *substr, comp_pat *pat, + const char *s, int searchform) { - int table = chartables_unicode_block[utfcode>>8]; - int idx = 0; - struct decode_state state; - - if (table == 255) { - /* No translations in this block */ - if (utfcode > 0x7ff) { - to[0] = (char)(0xE0 + (utfcode >> 12)); - to[1] = (char)(0x80 + ((utfcode >> 6) & 0x3f)); - to[2] = (char)(0x80 + (utfcode & 0x3f)); - return 3; - } - if (utfcode > 0x7f) { - to[0] = (char)(0xC0 + (utfcode >> 6)); - to[1] = (char)(0x80 + (utfcode & 0x3f)); - return 2; - } - to[0] = (char)utfcode; - return 1; - } + struct convert_rock *input, *tosearch; + int res; - START(state, chartables_unicode + table); - TRANSLATE(state, (utfcode & 0xff), to, idx); + tosearch = search_init(substr, pat); + input = uni_init(tosearch); + if (searchform) input = canon_init(1, input); - return idx; + mimeheader_cat(input, s); + + res = search_havematch(tosearch); + convert_free(input); + + return res; +} + +/* Compile a search pattern for later comparison. We just count + * how long the string is, and how many times the first character + * occurs. Later optimisation could reduce the max_start by + * deeper analysis of the possible paths through the string, but + * this is a good absolute maximum, and it just means a few more + * bytes get allocated... */ +comp_pat *charset_compilepat(const char *s) +{ + struct comp_pat_s *pat = xzmalloc(sizeof(struct comp_pat_s)); + const char *p = s; + /* count occurances */ + while (*p) { + if (*p == *s) pat->max_start++; + pat->patlen++; + p++; + } + return (comp_pat *)pat; } /* - * The various charset_searchfile() helper functions + * Free the compiled pattern 'pat' */ -struct input_state; -typedef int rawproc_t(struct input_state *state, char *buf, int size); - -static int charset_readconvert(struct input_state *state, char *buf, int size); -static rawproc_t charset_readplain; -static rawproc_t charset_readplain_nospc; -static rawproc_t charset_readmapnl; -static rawproc_t charset_readqp; -static rawproc_t charset_readqp_nospc; -static rawproc_t charset_readqpmapnl; -static rawproc_t charset_readbase64; -static rawproc_t charset_readbase64_nospc; +void charset_freepat(comp_pat *pat) +{ + free((struct comp_pat_s *)pat); +} /* - * State for the various charset_searchfile() helper functions + * Search for the string 'substr', with compiled pattern 'pat' + * in the string 's', with length 'len'. Return nonzero if match + * + * Uses the to_search target directly. Assumes 's' is already + * in search normal form (i.e. from a cache file) */ -struct input_state { - rawproc_t *rawproc; /* Function to read and transfer-decode data */ - const char *rawbase; /* Location in mapped file of raw data */ - int rawlen; /* # bytes raw data left to read from file */ - char decodebuf[2048]; /* Buffer of data deocded, but not converted - * into canonical searching form */ - int decodestart, decodeleft; /* Location/count of decoded data */ - struct decode_state decodestate; /* Charset state to convert decoded data - * into canonical searching form */ -}; +int charset_searchstring(const char *substr, comp_pat *pat, + const char *s, size_t len) +{ + struct convert_rock *tosearch; + int res, *found; + + /* set up the search handler */ + tosearch = search_init(substr, pat); + + /* feed the handler */ + while (len-- > 0) { + convert_putc(tosearch, (unsigned char)*s++); + if (search_havematch(tosearch)) break; /* shortcut if there's a match */ + } + /* copy the value */ + res = search_havematch(tosearch); + + /* clean up */ + search_free(tosearch); + + return res; +} /* * Search for the string 'substr' in the next 'len' bytes of @@ -566,139 +1042,37 @@ struct input_state { * Returns nonzero iff the string was found. */ int charset_searchfile(const char *substr, comp_pat *pat, - const char *msg_base, int mapnl, int len, int charset, int encoding) + const char *msg_base, int mapnl, size_t len, int charset, + int encoding) { - int substrlen = PATLEN(pat); - char *buf, smallbuf[2048]; - int bufsize; - int n; - int i, j, large; - struct input_state state; - + struct convert_rock *input, *tosearch; + int i, *found, res; + /* Initialize character set mapping */ - if (charset < 0 || charset >= chartables_num_charsets) return 0; - START(state.decodestate, chartables_charset_table[charset].table); - state.decodeleft = 0; + if (charset < 0 || charset >= chartables_num_charsets) + return 0; /* check for trivial search */ - if (substrlen == 0) return 1; - - /* - * Select buffer to hold canonical searching fomat data to - * search - */ - if (substrlen < (int) sizeof(smallbuf)/2) { - bufsize = sizeof(smallbuf); - buf = smallbuf; - } - else { - bufsize = substrlen+sizeof(smallbuf); - buf = xmalloc(bufsize); - } - - /* Optimized searching of us-ascii, using boyer-moore */ - if (charset == 0) { - /* Initialize transfer-decoding */ - state.rawbase = msg_base; - state.rawlen = len; - /* don't need to special case mapnl since all such chars will - be ignored, anyway */ - switch (encoding) { - case ENCODING_NONE: - state.rawproc = charset_readplain_nospc; - break; - - case ENCODING_QP: - state.rawproc = charset_readqp_nospc; - break; - - case ENCODING_BASE64: - state.rawproc = charset_readbase64_nospc; - /* XXX have to have nl-mapping base64 in order to - * properly count \n as 2 raw characters - */ - break; - - default: - /* Don't know encoding--nothing can match */ - return 0; - } - - if (PATASCII(pat)[0x80] == 0) { - /* 8-bit chars in pattern--search must fail */ - if (buf != smallbuf) free(buf); - return 0; - } - - n = (*state.rawproc)(&state, buf, bufsize); - if (n < substrlen) { - if (buf != smallbuf) free(buf); - return 0; - } - i = substrlen - 1; - PATASCII(pat)[PATLASTCHAR(pat)] = - PATASCII(pat)[PATOTHERLASTCHAR(pat)] = large = bufsize + i + 2; - - for (;;) { - /* Inner loop -- scan until last char match or end of buffer */ - while (i < n) { - i += PATASCII(pat)[(unsigned char)buf[i]]; - } - - /* End of buffer */ - if (i < large) { - /* Read in more stuff */ - j = i-n; - strncpy(buf, buf+i-(substrlen-1), substrlen-1-j); - n = (*state.rawproc)(&state, buf+substrlen-1-j, bufsize-substrlen+1+j); - i = substrlen-1; - if (n > 0) { - n += i-j; - continue; - } - if (buf != smallbuf) free(buf); - return 0; - } - - /* Last char match--back up and do compare */ - i -= large + 1; - j = PATLEN(pat) - 2; - while (j >= 0 && TOLOWER(buf[i]) == TOLOWER(substr[j])) { - i--; - j--; - } - if (j < 0) { - /* Found match */ - if (buf != smallbuf) free(buf); - return 1; - } - if (PATASCII(pat)[(unsigned char)buf[i]] == large || - PATASCII(pat)[(unsigned char)buf[i]] < PATLEN(pat)-j) { - i += PATLEN(pat) - j; - } - else { - i += PATASCII(pat)[(unsigned char)buf[i]]; - } - } - /* NOTREACHED */ - } + if (strlen(substr) == 0) + return 1; - /* Do the (generalized) search */ + /* set up the conversion path */ + tosearch = search_init(substr, pat); + input = uni_init(tosearch); + input = canon_init(1, input); + input = table_init(charset, input); - /* Initialize transfer-decoding */ - state.rawbase = msg_base; - state.rawlen = len; + /* choose encoding extraction if needed */ switch (encoding) { case ENCODING_NONE: - state.rawproc = mapnl ? charset_readmapnl : charset_readplain; break; case ENCODING_QP: - state.rawproc = mapnl ? charset_readqpmapnl : charset_readqp; + input = qp_init(0, input); break; case ENCODING_BASE64: - state.rawproc = charset_readbase64; + input = b64_init(input); /* XXX have to have nl-mapping base64 in order to * properly count \n as 2 raw characters */ @@ -706,87 +1080,56 @@ int charset_searchfile(const char *substr, comp_pat *pat, default: /* Don't know encoding--nothing can match */ + convert_free(input); return 0; } - n = charset_readconvert(&state, buf, bufsize); - if (n < substrlen) { - if (buf != smallbuf) free(buf); - return 0; - } - i = substrlen - 1; - pat[PATLASTCHAR(pat)] = large = bufsize + i + 2; - for (;;) { - /* Inner loop -- scan until last char match or end of buffer */ - while (i < n) { - i += pat[(unsigned char)buf[i]]; + /* implement the loop here so we can check on the search each time */ + for (i = 0; i < len; i++) { + if (mapnl && msg_base[i] == '\n') { + convert_putc(input, '\r'); + len--; } + convert_putc(input, msg_base[i]); + if (search_havematch(tosearch)) break; + } - /* End of buffer */ - if (i < large) { - /* Read in more stuff */ - j = i-n; - strncpy(buf, buf+i-(substrlen-1), substrlen-1-j); - n = charset_readconvert(&state, buf+substrlen-1-j, - bufsize-substrlen+1+j); - i = substrlen-1; - if (n > 0) { - n += i-j; - continue; - } - if (buf != smallbuf) free(buf); - return 0; - } + res = search_havematch(tosearch); /* copy before we free it */ - /* Last char match--back up and do compare */ - i -= large + 1; - j = PATLEN(pat) - 2; - while (j >= 0 && buf[i] == substr[j]) { - i--; - j--; - } - if (j < 0) { - /* Found match */ - if (buf != smallbuf) free(buf); - return 1; - } - if (pat[(unsigned char)buf[i]] == large || - pat[(unsigned char)buf[i]] < PATLEN(pat)-j) { - i += PATLEN(pat) - j; - } - else { - i += pat[(unsigned char)buf[i]]; - } - } + convert_free(input); + + return res; } /* This is based on charset_searchfile above. */ int charset_extractfile(index_search_text_receiver_t receiver, - void* rock, int uid, const char *msg_base, int mapnl, int len, int charset, - int encoding) { - char buf[2048]; - int n; - struct input_state state; - + void* rock, int uid, const char *msg_base, int mapnl, size_t len, + int charset, int encoding) +{ + struct convert_rock *input, *tobuffer; + struct buffer_state *out; + int i; + + /* set up the conversion path */ + tobuffer = buffer_init(0, 0); + input = uni_init(tobuffer); + input = canon_init(1, input); + input = table_init(charset, input); + /* Initialize character set mapping */ - if (charset < 0 || charset >= chartables_num_charsets) return 0; - START(state.decodestate, chartables_charset_table[charset].table); - state.decodeleft = 0; + if (charset < 0 || charset >= chartables_num_charsets) + return 0; - /* Initialize transfer-decoding */ - state.rawbase = msg_base; - state.rawlen = len; switch (encoding) { case ENCODING_NONE: - state.rawproc = mapnl ? charset_readmapnl : charset_readplain; break; case ENCODING_QP: - state.rawproc = mapnl ? charset_readqpmapnl : charset_readqp; + input = qp_init(0, input); break; case ENCODING_BASE64: - state.rawproc = charset_readbase64; + input = b64_init(input); /* XXX have to have nl-mapping base64 in order to * properly count \n as 2 raw characters */ @@ -794,50 +1137,33 @@ int charset_extractfile(index_search_text_receiver_t receiver, default: /* Don't know encoding--nothing can match */ + convert_free(input); return 0; } - /* We don't need to do anything tricky. Just read and convert each block of the - text, then hand the converted text down to the receiver. */ - do { - n = charset_readconvert(&state, buf, sizeof(buf)); - if (n > 0) { - receiver(uid, SEARCHINDEX_PART_BODY, - SEARCHINDEX_CMD_APPENDPART, buf, n, rock); - } - } while (n > 0); + /* point to the buffer for easy block sending */ + out = (struct buffer_state *)tobuffer->state; - return 1; -} - -/* - * Helper function to read at most 'size' bytes of converted - * (into canonical searching format) data into 'buf'. Returns - * the number of converted bytes, or 0 for end-of-data. - */ -static int charset_readconvert(struct input_state *state, char *buf, int size) -{ - int retval = 0; + for (i = 0; i < len; i++) { + if (mapnl && msg_base[i] == '\n') { + convert_putc(input, '\r'); + len--; + } + convert_putc(input, msg_base[i]); - if (state->decodeleft && state->decodestart != 0) { - memmove(state->decodebuf, state->decodebuf+state->decodestart, - state->decodeleft); + /* process a block of output every so often */ + if (out->offset > 4096) { + receiver(uid, SEARCHINDEX_PART_BODY, SEARCHINDEX_CMD_APPENDPART, out->base, out->offset, rock); + out->offset = 0; + } + } + if (out->offset) { /* finish it */ + receiver(uid, SEARCHINDEX_PART_BODY, SEARCHINDEX_CMD_APPENDPART, out->base, out->offset, rock); } - state->decodestart = 0; - state->decodeleft += (*state->rawproc)(state, - state->decodebuf+state->decodeleft, - sizeof(state->decodebuf)-state->decodeleft); + convert_free(input); - while (state->decodeleft) { - if (retval + charset_max_translation > size) { - return retval; - } - TRANSLATE(state->decodestate, state->decodebuf[state->decodestart], buf, retval); - state->decodestart++; - state->decodeleft--; - } - return retval; + return 1; } /* @@ -847,14 +1173,10 @@ static int charset_readconvert(struct input_state *state, char *buf, int size) * least size 'alloced'. Returns the number of decoded bytes in * 'outlen'. */ -char *charset_decode_mimebody(const char *msg_base, int len, int encoding, - char **retval, int alloced, int *outlen) +char *charset_decode_mimebody(const char *msg_base, size_t len, int encoding, + char **retval, size_t alloced, size_t *outlen) { - struct input_state state; - - /* Initialize transfer-decoding */ - state.rawbase = msg_base; - state.rawlen = len; + struct convert_rock *input, *tobuffer; switch (encoding) { case ENCODING_NONE: @@ -862,11 +1184,13 @@ char *charset_decode_mimebody(const char *msg_base, int len, int encoding, return (char *) msg_base; case ENCODING_QP: - state.rawproc = charset_readqp; + tobuffer = buffer_init(*retval, alloced); + input = qp_init(0, tobuffer); break; case ENCODING_BASE64: - state.rawproc = charset_readbase64; + tobuffer = buffer_init(*retval, alloced); + input = b64_init(tobuffer); break; default: @@ -874,461 +1198,22 @@ char *charset_decode_mimebody(const char *msg_base, int len, int encoding, return NULL; } - if (alloced < len+1) *retval = xrealloc(*retval, len+1); - *outlen = (*state.rawproc)(&state, *retval, len); - (*retval)[*outlen] = '\0'; - return *retval; -} - -/* - * Helper function to read at most 'size' bytes of trivial - * transfer-decoded data into 'buf'. Returns the number of decoded - * bytes, or 0 for end-of-data. - */ -static int charset_readplain(struct input_state *state, char *buf, int size) -{ - if (size > state->rawlen) size = state->rawlen; - if (!size) return 0; - - memcpy(buf, state->rawbase, size); - state->rawlen -= size; - state->rawbase += size; - - return size; -} - -/* - * Helper function to read at most 'size' bytes of trivial - * transfer-decoded data into 'buf'. Removes any US-ASCII whitespace. - * Returns the number of decoded bytes, or 0 for end-of-data. - */ -static int charset_readplain_nospc(struct input_state *state, - char *buf, int size) -{ - int i; - - for (i = 0; i < size; i++) { - /* remove any whitespace at the beginning of rawbase */ - while (state->rawlen > 0 && USASCII(*state->rawbase) == END) { - state->rawlen--; - state->rawbase++; - } - - if (state->rawlen == 0) break; - - /* copy a char */ - buf[i] = *state->rawbase++; - state->rawlen--; - } - - return i; -} - -/* - * Helper function to read at most 'size' bytes of trivial newline-mapped - * transfer-decoded data into 'buf'. Returns the number of decoded - * bytes, or 0 for end-of-data. - */ -static int charset_readmapnl(struct input_state *state, char *buf, int size) -{ - int retval = 0; - char c; - - while (size && state->rawlen > 0) { - c = *state->rawbase; - if (c == '\n') { - if (size < 2) { - return retval; - } - *buf++ = '\r'; - retval++; - size--; - state->rawlen--; - } - *buf++ = c; - state->rawbase++; - state->rawlen--; - retval++; - size--; - } - return retval; -} - -/* - * Helper function to read at most 'size' bytes of quoted-printable - * transfer-decoded data into 'buf'. Returns the number of decoded - * bytes, or 0 for end-of-data. - */ -static int charset_readqp(struct input_state *state, char *buf, int size) -{ - int retval = 0; - int c, c1, c2; - const char *nextline, *endline; - - nextline = endline = state->rawbase; - - while (size && state->rawlen) { - if (state->rawbase >= nextline) { - /* Ignore trailing whitespace at end of line */ - - nextline = - (const char*) memchr(state->rawbase+1, '\r', state->rawlen-1); - if (!nextline) nextline = state->rawbase + state->rawlen; - endline = nextline; - while (endline > state->rawbase && - (endline[-1] == ' ' || endline[-1] == '\t')) { - endline--; - } - } - if (state->rawbase >= endline) { - state->rawbase += nextline - endline; - state->rawlen -= nextline - endline; - continue; - } - - c = state->rawbase[0]; - if (c == '=') { - if (state->rawlen < 3) { - return retval; - } - c1 = state->rawbase[1]; - c2 = state->rawbase[2]; - state->rawbase += 3; - state->rawlen -= 3; - c1 = HEXCHAR(c1); - c2 = HEXCHAR(c2); - /* Following line also takes care of soft line breaks */ - if (c1 == XX && c2 == XX) continue; - *buf++ = (char)((c1 << 4) + c2); - retval++; - size--; - } - else { - state->rawbase++; - state->rawlen--; - *buf++ = (char)c; - retval++; - size--; - } - } - return retval; -} - -/* - * Helper function to read at most 'size' bytes of quoted-printable - * transfer-decoded data into 'buf'. Returns the number of decoded - * bytes, or 0 for end-of-data. Removes any US-ASCII whitespace. - * Since it just throws out \r's anyway, it's simplier than paying - * attention to them - */ -static int charset_readqp_nospc(struct input_state *state, char *buf, int size) -{ - int retval = 0; - int c, c1, c2; - char dec; - const char *nextline, *endline; - - nextline = endline = state->rawbase; - - while (size && state->rawlen) { - if (state->rawbase >= nextline) { - /* Ignore trailing whitespace at end of line */ - - nextline = - (const char*) memchr(state->rawbase+1, '\n', state->rawlen-1); - if (!nextline) nextline = state->rawbase + state->rawlen; - endline = nextline; - while (endline > state->rawbase && (USASCII(endline[-1]) == END)) { - endline--; - } - } - if (state->rawbase >= endline) { - state->rawbase += nextline - endline; - state->rawlen -= nextline - endline; - continue; - } - - c = state->rawbase[0]; - if (c == '=') { - if (state->rawlen < 3) { - return retval; - } - c1 = state->rawbase[1]; - c2 = state->rawbase[2]; - state->rawbase += 3; - state->rawlen -= 3; - c1 = HEXCHAR(c1); - c2 = HEXCHAR(c2); - /* Following line also takes care of soft line breaks */ - if (c1 == XX && c2 == XX) continue; - dec = (char)((c1 << 4) + c2); - if (USASCII(dec) != END) { - /* non-whitespace, take it */ - *buf++ = (char)((c1 << 4) + c2); - retval++; - size--; - } - } - else { - state->rawbase++; - state->rawlen--; - if (USASCII(c) != END) { - /* non-whitespace, grab it */ - *buf++ = (char)c; - retval++; - size--; - } - } - } - return retval; -} - -/* - * Helper function to read at most 'size' bytes of QP newline-mapped - * transfer-decoded data into 'buf'. Returns the number of decoded - * bytes, or 0 for end-of-data. - */ -static int charset_readqpmapnl(struct input_state *state, char *buf, int size) -{ - int retval = 0; - int c, c1, c2; - const char *nextline, *endline; - - nextline = endline = state->rawbase; - - while (size && state->rawlen > 0) { - if (state->rawbase >= nextline) { - /* Ignore trailing whitespace at end of line */ - - nextline = (const char*) - memchr(state->rawbase+1, '\n', state->rawlen - 1); - if (!nextline) nextline = state->rawbase + state->rawlen; - endline = nextline; - while (endline > state->rawbase && - (endline[-1] == ' ' || endline[-1] == '\t')) { - endline--; - } - } - if (state->rawbase >= endline) { - state->rawbase += nextline - endline; - state->rawlen -= nextline - endline; - continue; - } - - c = state->rawbase[0]; - if (c == '=') { - if (state->rawbase+1 == endline) { - state->rawbase = nextline + 1; - state->rawlen -= 3 + (nextline - endline); - - continue; - } - if (state->rawlen < 3) { - return retval; - } - c1 = state->rawbase[1]; - c2 = state->rawbase[2]; - state->rawbase += 3; - state->rawlen -= 3; - if (c2 == '\n') state->rawlen--; - c1 = HEXCHAR(c1); - c2 = HEXCHAR(c2); - if (c1 == XX && c2 == XX) continue; - *buf++ = (char)((c1 << 4) + c2); - retval++; - size--; - } - else if (c == '\n') { - if (size < 2) { - return retval; - } - state->rawbase++; - state->rawlen -= 2; - *buf++ = '\r'; - *buf++ = '\n'; - retval += 2; - size -= 2; - } - else { - state->rawbase++; - state->rawlen--; - *buf++ = (char)c; - retval++; - size--; - } + convert_catn(input, msg_base, len); + + /* extract the string from the buffer, messy - but we want to + * do it without becoming a cstring or being prematurely freed! */ + { + struct buffer_state *buf = (struct buffer_state *)tobuffer->state; + *retval = buf->base; + *outlen = buf->offset; + buf->base = 0; + buf->alloced = 0; + buf->offset = 0; } - return retval; -} -/* - * Helper function to read at most 'size' bytes of base64 - * transfer-decoded data into 'buf'. Returns the number of decoded - * bytes, or 0 for end-of-data. - */ -static int charset_readbase64(struct input_state *state, char *buf, int size) -{ - int retval = 0; - int c1, c2, c3, c4; - - while (size >= 3 && state->rawlen) { - do { - c1 = *state->rawbase++; - state->rawlen--; - if (c1 == '=') { - state->rawlen = 0; - return retval; - } - } while (state->rawlen && CHAR64(c1) == XX); - if (!state->rawlen) { - return retval; - } - - do { - c2 = *state->rawbase++; - state->rawlen--; - if (c2 == '=') { - state->rawlen = 0; - return retval; - } - } while (state->rawlen && CHAR64(c2) == XX); - if (!state->rawlen) { - return retval; - } + convert_free(input); - do { - c3 = *state->rawbase++; - state->rawlen--; - if (c3 == '=') { - *buf++ = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4)); - retval++; - state->rawlen = 0; - return retval; - } - } while (state->rawlen && CHAR64(c3) == XX); - if (!state->rawlen) { - return retval; - } - - do { - c4 = *state->rawbase++; - state->rawlen--; - if (c4 == '=') { - *buf++ = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4)); - *buf++ = (char)(((CHAR64(c2)&0xf)<<4) | ((CHAR64(c3)&0x3c)>>2)); - retval += 2; - state->rawlen = 0; - return retval; - } - } while (state->rawlen && CHAR64(c4) == XX); - if (CHAR64(c4) == XX) { - return retval; - } - - *buf++ = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4)); - *buf++ = (char)(((CHAR64(c2)&0xf)<<4) | ((CHAR64(c3)&0x3c)>>2)); - *buf++ = (char)(((CHAR64(c3)&0x3)<<6) | CHAR64(c4)); - retval += 3; - size -= 3; - } - return retval; -} - -/* - * Helper function to read at most 'size' bytes of base64 - * transfer-decoded data into 'buf'. Returns the number of decoded - * bytes, or 0 for end-of-data. Removes any US-ASCII whitespace. - */ -static int charset_readbase64_nospc(struct input_state *state, - char *buf, int size) -{ - int retval = 0; - int c1, c2, c3, c4; - char dec; - - while (size >= 3 && state->rawlen) { - do { - c1 = *state->rawbase++; - state->rawlen--; - if (c1 == '=') { - state->rawlen = 0; - return retval; - } - } while (state->rawlen && CHAR64(c1) == XX); - if (!state->rawlen) { - return retval; - } - - do { - c2 = *state->rawbase++; - state->rawlen--; - if (c2 == '=') { - state->rawlen = 0; - return retval; - } - } while (state->rawlen && CHAR64(c2) == XX); - if (!state->rawlen) { - return retval; - } - - do { - c3 = *state->rawbase++; - state->rawlen--; - if (c3 == '=') { - dec = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4)); - if (USASCII(dec) != END) { - *buf++ = dec; - retval++; - } - state->rawlen = 0; - return retval; - } - } while (state->rawlen && CHAR64(c3) == XX); - if (!state->rawlen) { - return retval; - } - - do { - c4 = *state->rawbase++; - state->rawlen--; - if (c4 == '=') { - dec = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4)); - if (USASCII(dec) != END) { - *buf++ = dec; - retval++; - } - dec = (char)(((CHAR64(c2)&0xf)<<4) | ((CHAR64(c3)&0x3c)>>2)); - if (USASCII(dec) != END) { - *buf++ = dec; - retval++; - } - state->rawlen = 0; - return retval; - } - } while (state->rawlen && CHAR64(c4) == XX); - if (CHAR64(c4) == XX) { - return retval; - } - - dec = (char)((CHAR64(c1)<<2) | ((CHAR64(c2)&0x30)>>4)); - if (USASCII(dec) != END) { - *buf++ = dec; - retval++; - size--; - } - dec = (char)(((CHAR64(c2)&0xf)<<4) | ((CHAR64(c3)&0x3c)>>2)); - if (USASCII(dec) != END) { - *buf++ = dec; - retval++; - size--; - } - dec = (char)(((CHAR64(c3)&0x3)<<6) | CHAR64(c4)); - if (USASCII(dec) != END) { - *buf++ = dec; - retval++; - size--; - } - } - return retval; + return *retval; } /* @@ -1345,8 +1230,8 @@ static int charset_readbase64_nospc(struct input_state *state, static char base_64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; -char *charset_encode_mimebody(const char *msg_base, int len, - char *retval, int *outlen, int *outlines) +char *charset_encode_mimebody(const char *msg_base, size_t len, + char *retval, size_t *outlen, int *outlines) { const unsigned char *s; unsigned char s0, s1, s2; @@ -1403,3 +1288,4 @@ char *charset_encode_mimebody(const char *msg_base, int len, return (b64_len ? retval : NULL); } + diff --git a/lib/charset.h b/lib/charset.h index 573975a..4e495e8 100644 --- a/lib/charset.h +++ b/lib/charset.h @@ -44,10 +44,6 @@ #ifndef INCLUDED_CHARSET_H #define INCLUDED_CHARSET_H -/* Marker to indicate characters that don't map to anything */ -#define EMPTY 'X' -#define EMPTY_STRING "X" - #define ENCODING_NONE 0 #define ENCODING_QP 1 #define ENCODING_BASE64 2 @@ -60,22 +56,27 @@ typedef int charset_index; /* ensure up to MAXTRANSLATION times expansion into buf */ extern char *charset_convert(const char *s, charset_index charset, char *buf, - int bufsz); -extern char *charset_decode_mimeheader(const char *s, char *buf, int bufsz); + size_t bufsz); +extern char *charset_decode_mimeheader(const char *s, char *buf, + size_t bufsz); +extern char *charset_parse_mimeheader(const char *s); extern charset_index charset_lookupname(const char *name); extern comp_pat *charset_compilepat(const char *s); extern void charset_freepat(comp_pat *pat); extern int charset_searchstring(const char *substr, comp_pat *pat, - const char *s, int len); + const char *s, size_t len); extern int charset_searchfile(const char *substr, comp_pat *pat, - const char *msg_base, int mapnl, int len, + const char *msg_base, int mapnl, size_t len, charset_index charset, int encoding); -extern char *charset_decode_mimebody(const char *msg_base, int len, - int encoding, char **retval, int alloced, - int *outlen); -extern char *charset_encode_mimebody(const char *msg_base, int len, - char *retval, int *outlen, int *outlines); +extern char *charset_decode_mimebody(const char *msg_base, size_t len, + int encoding, char **retval, size_t alloced, + size_t *outlen); +extern char *charset_encode_mimebody(const char *msg_base, size_t len, + char *retval, size_t *outlen, + int *outlines); +extern char *charset_to_utf8(const char *msg_base, size_t len, charset_index charset, int encoding); +extern int charset_search_mimeheader(const char *substr, comp_pat *pat, const char *s, int searchform); /* Definitions for charset_extractfile */ @@ -121,7 +122,7 @@ typedef void index_search_text_receiver_t(int UID, int part, int cmds, by index_getsearchtextmsg to extract the MIME body parts. */ extern int charset_extractfile(index_search_text_receiver_t receiver, void* rock, int uid, const char *msg_base, - int mapnl, int len, charset_index charset, + int mapnl, size_t len, charset_index charset, int encoding); #endif /* INCLUDED_CHARSET_H */ diff --git a/lib/charset/iso-2022-jp.t b/lib/charset/iso-2022-jp.t index 8d1a161..8c12572 100644 --- a/lib/charset/iso-2022-jp.t +++ b/lib/charset/iso-2022-jp.t @@ -67,7 +67,7 @@ 18 0018 CANCEL (CAN) 19 0019 END OF MEDIUM (EM) 1A 001a SUBSTITUTE (SUB) -1B >ESC +# 1B >ESC 1C 001c FILE SEPARATOR (IS4) 1D 001d GROUP SEPARATOR (IS3) 1E 001e RECORD SEPARATOR (IS2) @@ -169,17 +169,11 @@ 7E 007e TILDE 7F 007f DELETE (DEL) -:ESC < -24 :ESC-$ -28 :ESC-( - -:ESC-( < -42 :US-ASCII -4A :JIS-0201 - -:ESC-$ < -40 :JIS-0208 -42 :JIS-0208 +# ESCAPE CODES +1B2440 :JIS-0208 ESC-$-@ +1B2442 :JIS-0208 ESC-$-B +1B2842 :US-ASCII ESC-(-B +1B284A :JIS-0201 ESC-(-J :JIS-0201 0 0000 NULL (NUL) @@ -209,7 +203,7 @@ 18 0018 CANCEL (CAN) 19 0019 END OF MEDIUM (EM) 1A 001a SUBSTITUTE (SUB) -1B >ESC +# 1B >ESC 1C 001c FILE SEPARATOR (IS4) 1D 001d GROUP SEPARATOR (IS3) 1E 001e RECORD SEPARATOR (IS2) @@ -375,8 +369,13 @@ DD FF9D HALFWIDTH KATAKANA LETTER N DE FF9E HALFWIDTH KATAKANA VOICED SOUND MARK DF FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +# ESCAPE CODES +1B2440 :JIS-0208 ESC-$-@ +1B2442 :JIS-0208 ESC-$-B +1B2842 :US-ASCII ESC-(-B +1B284A :JIS-0201 ESC-(-J + :JIS-0208 -1B >ESC 2121 3000 IDEOGRAPHIC SPACE 2122 3001 IDEOGRAPHIC COMMA 2123 3002 IDEOGRAPHIC FULL STOP @@ -7256,3 +7255,9 @@ DF FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK 7424 7464 7425 51DC 7426 7199 + +# ESCAPE CODES +1B2440 :JIS-0208 ESC-$-@ +1B2442 :JIS-0208 ESC-$-B +1B2842 :US-ASCII ESC-(-B +1B284A :JIS-0201 ESC-(-J diff --git a/lib/charset/iso-2022-kr.t b/lib/charset/iso-2022-kr.t index 1ded497..ba14972 100644 --- a/lib/charset/iso-2022-kr.t +++ b/lib/charset/iso-2022-kr.t @@ -67,7 +67,7 @@ 18 0018 CANCEL (CAN) 19 0019 END OF MEDIUM (EM) 1A 001a SUBSTITUTE (SUB) -1B >ESC +# 1B >ESC 1C 001c FILE SEPARATOR (IS4) 1D 001d GROUP SEPARATOR (IS3) 1E 001e RECORD SEPARATOR (IS2) @@ -169,14 +169,8 @@ 7E 007e TILDE 7F 007f DELETE (DEL) -:ESC < -24 :ESC-$ - -:ESC-$ < -29 :ESC-$-) - -:ESC-$-) < -43 :US-ASCII +# ESCAPE CODE (pretty pointless, but need to return nothing if we follow this path) +1B242943 :US-ASCII ESC-$-)-C :KSC-5601 0f :US-ASCII diff --git a/lib/chartable.h b/lib/chartable.h index 0b9c3fc..63dd2d0 100644 --- a/lib/chartable.h +++ b/lib/chartable.h @@ -42,31 +42,14 @@ * $Id: chartable.h,v 1.6 2008/03/24 17:43:08 murch Exp $ */ -/* note that these are all uppercase letters. since the translation - tables canonicalize to lower case letters, we never see these bytes - in the output UTF-8 and they're safely used as control codes to the - character decoder. */ - -/* note that currently we never return a character that is represented - * by more than 3 octets in UTF-8, since we only deal with characters - * in UCS-2. this means that 11110xxx, 111110xx, and 1111110x never - * appear in our outgoing tables, and could be used instead of the following. - */ - -#define XLT 'N' /* Long translation */ -#define U7F 'O' /* UTF-7 first base64 character */ -#define U7N 'P' /* UTF-7 subsquent base64 character */ -#define U83 'Q' /* UTF-8 3-char sequence */ -#define U83_2 'R' /* second char of same */ -#define U83_3 'S' /* third char of same */ -#define JSR 'T' -#define JMP 'U' -#define RET 'V' -#define END 'W' +struct charmap { + unsigned int c; + unsigned char next; +}; struct charset { char *name; - const unsigned char (*table)[256][4]; + const struct charmap (*table)[256]; }; diff --git a/lib/mkchartable.c b/lib/mkchartable.c deleted file mode 100644 index 1980258..0000000 --- a/lib/mkchartable.c +++ /dev/null @@ -1,975 +0,0 @@ -/* mkchartable.c -- Generate character set mapping table - * - * Copyright (c) 1994-2008 Carnegie Mellon University. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The name "Carnegie Mellon University" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For permission or any legal - * details, please contact - * Carnegie Mellon University - * Center for Technology Transfer and Enterprise Creation - * 4615 Forbes Avenue - * Suite 302 - * Pittsburgh, PA 15213 - * (412) 268-7393, fax: (412) 268-7395 - * innovation@andrew.cmu.edu - * - * 4. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by Computing Services - * at Carnegie Mellon University (http://www.cmu.edu/computing/)." - * - * CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO - * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY - * AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE - * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * $Id: mkchartable.c,v 1.28 2009/03/31 04:11:22 brong Exp $ - */ - -#include -#include -#include -#include -#include -#include -#include "xmalloc.h" -#include "util.h" - -#define XX 127 -/* - * Table for decoding hexadecimal - */ -static const char index_hex[256] = { - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,XX,XX, XX,XX,XX,XX, - XX,10,11,12, 13,14,15,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,10,11,12, 13,14,15,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, - XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, -}; -#define HEXCHAR(c) (index_hex[(unsigned char)(c)]) - -#define MAX_MAPCODE 20 - -struct cmap { - int code; - int num_mapcode; - int mapcode[MAX_MAPCODE]; - char *translation; - int trans_offset; -}; - -struct cmap *map=NULL; -int map_num=0; -int map_alloc=0; -#define MAPGROW 200 - -struct tablechar { - int code; - char *translation; - int trans_offset; - char *action; - char *comment; -}; -#define EMPTYTCHAR(tc) ((tc).code == -1 && !(tc).translation && !(tc).action) - -struct table { - char *name; - char *endaction; - struct tablechar ch[256]; -}; - -struct table *table=NULL; -int table_num=0; -int table_alloc=0; -#define TABLEGROW 200 - -static void readmapfile(char *name); -static void mungemappings(void); -static void readcharfile(char *name); -static void printtable(char *name); -static void freetabledata(void); -static void freetable(void); -static void freemap(void); -static void usage(void); -static int newstate(char *args); -static int findstate(char *name); -static void mkunicodetable(void); -static void mkutf8table(void); -static void mkutf7table(void); - -int -main(int argc, char **argv) -{ - int opt; - - while ((opt = getopt(argc, argv, "m:")) != EOF) { - switch (opt) { - case 'm': - readmapfile(optarg); - break; - - default: - usage(); - } - } - - if (map_num == 0 || argc == optind) usage(); - - printf("#include \"charset.h\"\n"); - printf("#include \"chartable.h\"\n"); - - mungemappings(); - - fprintf(stderr, "mkchartable: mapping unicode...\n"); - mkunicodetable(); - printtable("unicode"); - - fprintf(stderr, "mkchartable: mapping UTF-8...\n"); - mkutf8table(); - printtable("utf-8"); - - fprintf(stderr, "mkchartable: mapping UTF-7...\n"); - mkutf7table(); - printtable("utf-7"); - - while (argv[optind]) { - fprintf(stderr, "mkchartable: mapping %s...\n", argv[optind]); - readcharfile(argv[optind]); - printtable(argv[optind]); - freetabledata(); - optind++; - } - - printf("/*\n"); - printf(" * Mapping of character sets to tables\n"); - printf(" */\n"); - printf("const struct charset chartables_charset_table[] = {\n"); - printf(" { \"us-ascii\", chartables_us_ascii }, /* US-ASCII must be charset number 0 */\n"); - printf(" { \"utf-8\", chartables_utf_8 },\n"); - printf(" { \"utf-7\", chartables_utf_7 },\n"); - printf(" { \"iso-8859-1\", chartables_iso_8859_1 },\n"); - printf(" { \"iso-8859-2\", chartables_iso_8859_2 },\n"); - printf(" { \"iso-8859-3\", chartables_iso_8859_3 },\n"); - printf(" { \"iso-8859-4\", chartables_iso_8859_4 },\n"); - printf(" { \"iso-8859-5\", chartables_iso_8859_5 },\n"); - printf(" { \"iso-8859-6\", chartables_iso_8859_6 },\n"); - printf(" { \"iso-8859-7\", chartables_iso_8859_7 },\n"); - printf(" { \"iso-8859-8\", chartables_iso_8859_8 },\n"); - printf(" { \"iso-8859-9\", chartables_iso_8859_9 },\n"); - printf(" { \"koi8-r\", chartables_koi8_r },\n"); - printf(" { \"iso-2022-jp\", chartables_iso_2022_jp },\n"); - printf(" { \"iso-2022-kr\", chartables_iso_2022_kr },\n"); - printf(" { \"gb2312\", chartables_gb2312 },\n"); - printf(" { \"big5\", chartables_big5 },\n"); - printf(" /* Compatibility names */\n"); - printf(" { \"unicode-1-1-utf-7\", chartables_utf_7 },\n"); - printf(" { \"unicode-2-0-utf-7\", chartables_utf_7 },\n"); - printf(" { \"x-unicode-2-0-utf-7\", chartables_utf_7 },\n"); - printf(" /* End Compatibility Names */\n"); - printf(" { \"iso-8859-15\", chartables_iso_8859_15 },\n"); - printf(" { \"windows-1252\", chartables_windows_1252 },\n"); - printf(" { \"windows-1256\", chartables_windows_1256 },\n"); - printf(" { \"windows-1250\", chartables_windows_1250 },\n"); - printf(" { \"windows-1251\", chartables_windows_1251 },\n"); - printf(" { \"windows-1255\", chartables_windows_1255 },\n"); - printf(" /* New character sets should only be added to end so that\n"); - printf(" * cache files stay with valid information */\n"); - printf("};\n"); - printf("const int chartables_num_charsets = (sizeof(chartables_charset_table)/sizeof(*chartables_charset_table));\n"); - - freetable(); - freemap(); - - return 0; -} - -static void usage(void) -{ - fprintf(stderr, "usage: mkchartable -m mapfile charsetfile...\r\n"); - exit(1); -} - -/* Read a Unicode table, deriving useful mappings from it */ -static void -readmapfile(char *name) -{ - FILE *mapfile; - char buf[1024]; - char *p; - int line = 0; - int n, code, i, c; - static struct cmap zeromap; - - mapfile = fopen(name, "r"); - if (!mapfile) { - perror(name); - exit(1); - } - - while (fgets(buf, sizeof(buf), mapfile)) { - line++; - p = buf; - while (*p && Uisspace(*p)) p++; - if (!*p || *p == '#') continue; - - /* Unicode character */ - code = 0; - for (i=0; i<4; i++) { - c = HEXCHAR(*p); - p++; - if (c == XX) goto syntaxerr; - code = code*16 + c; - } - if (*p++ != ';') goto syntaxerr; - - /* Character name */ - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - if (map_num == map_alloc) { - map_alloc += MAPGROW; - map = (struct cmap *) - xrealloc((char *)map, map_alloc * sizeof(struct cmap)); - } - map[map_num] = zeromap; - map[map_num].code = code; - - /* General Category */ - if (*p == 'Z') { - /* Is whitespace, map to empty string */ - map[map_num].num_mapcode = 0; - map_num++; - continue; - } - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - /* Canonical Combining Class */ - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - /* Bidirectional category */ - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - /* Character decomposition */ - n = 0; - while (*p && *p != ';') { - if (n + 1 == MAX_MAPCODE) goto syntaxerr; - if (*p == '<') { - /* Compatability mapping, skip over the */ - p = strchr(p, '>'); - if (!p || p[1] != ' ') goto syntaxerr; - p += 2; - - /* Ignore compat mappings to SP followed by combining char */ - if (!strncmp(p, "0020 ", 5)) { - p = strchr(p, ';'); - break; - } - } - - code = 0; - for (i=0; i<4; i++) { - c = HEXCHAR(*p); - p++; - if (c == XX) goto syntaxerr; - code = code*16 + c; - } - if (*p == ' ') p++; - map[map_num].mapcode[n++] = code; - } - if (*p++ != ';') goto syntaxerr; - - /* Decimal digit value */ - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - /* Digit value */ - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - /* Numeric value */ - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - /* Mirrored character */ - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - /* Unicode 1.0 name */ - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - /* Comment */ - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - /* Upper case equivalent mapping */ - while (*p && *p != ';') p++; - if (*p++ != ';') goto syntaxerr; - - /* Lower case equivalent mapping */ - if (*p == ';') { - /* No case mapping, use any decomposition we found above */ - if (n) { - map[map_num].num_mapcode = n; - map_num++; - } - continue; - } - code = 0; - for (i=0; i<4; i++) { - c = HEXCHAR(*p); - p++; - if (c == XX) goto syntaxerr; - code = code*16 + c; - } - if (*p != ';') goto syntaxerr; - map[map_num].mapcode[0] = code; - map[map_num].num_mapcode = 1; - map_num++; - } - fclose(mapfile); - return; - syntaxerr: - fprintf(stderr, "%s: line %d: syntax error\n", name, line); - exit(1); -} - -/* Perform the transitive closure on the unicode mapping table - * Calculate translations for mappings - */ -static void -mungemappings(void) -{ - int didchange; - int n, newn, n_mapcode, i; - int new_mapcode[MAX_MAPCODE]; - int num_new_mapcode; - int last_translation = 1; - int max_len = 3; - - /* Keep scanning the table until no changes are made */ - do { - didchange = 0; - - fprintf(stderr, "mkchartable: expanding unicode mappings...\n"); - - for (n = 0; n < map_num; n++) { - /* Build new map code sequence by iterating over existing - * mapcode sequence - */ - num_new_mapcode = 0; - for (n_mapcode = 0; n_mapcode < map[n].num_mapcode; n_mapcode++) { - - /* Search for a translation of this particular code */ - for (newn = 0; newn < map_num; newn++) { - if (map[newn].code == map[n].mapcode[n_mapcode]) break; - } - if (newn != map_num) { - /* We have a translation */ - didchange++; - for (i = 0; i < map[newn].num_mapcode; i++) { - new_mapcode[num_new_mapcode++] = map[newn].mapcode[i]; - } - } - else { - /* Keep the old mapping for this code */ - new_mapcode[num_new_mapcode++] = map[n].mapcode[n_mapcode]; - } - } - - /* Copy in the new translation */ - map[n].num_mapcode = num_new_mapcode; - memcpy(map[n].mapcode, new_mapcode, sizeof(new_mapcode)); - } - } while (didchange); - - printf("/* The following unicode mapping table is in effect\n"); - printf("From To\n"); - for (n = 0; n < map_num; n++) { - printf("\n%04x", map[n].code); - for (i = 0; i < map[n].num_mapcode; i++) { - printf(" %04x", map[n].mapcode[i]); - } - } - printf("\n*/\n"); - - fprintf(stderr, "mkchartable: building expansion table...\n"); - - printf("/* Table of traslations longer than three octets.\n"); - printf(" * The XLT code in other tables is followed by an 2-octet\n"); - printf(" * index into this table.\n"); - printf(" * The index of 0 is reserved to mean 'no translation'\n"); - printf(" */\n"); - printf("const unsigned char chartables_long_translations[] = { 0, \n"); - - for (n = 0; n < map_num; n++) { - int n_mapcode, code; - unsigned char translation[256]; - int n_t; - - /* Build translation strings for mappings to 0 or multiple codes */ - if (map[n].num_mapcode == 0) { - map[n].translation = xstrdup(""); - } - else if (map[n].num_mapcode > 1) { - n_t = 0; - for (n_mapcode = 0; n_mapcode < map[n].num_mapcode; n_mapcode++) { - code = map[n].mapcode[n_mapcode]; - /* Convert code to UTF-8 */ - if (code && code <= 0x7f) { - translation[n_t++] = (unsigned char)code; - } - else if (code <= 0x7FF) { - translation[n_t++] = (unsigned char) (0xc0 + (code>>6)); - translation[n_t++] = (unsigned char) (0x80+(code&0x3f)); - } - else { - translation[n_t++] = (unsigned char) (0xe0 + (code>>12)); - translation[n_t++] = (unsigned char) (0x80+((code>>6)&0x3f)); - translation[n_t++] = (unsigned char) (0x80+(code&0x3f)); - } - } - if (n_t <= 3) { - map[n].translation = xmalloc(4); - memcpy(map[n].translation, translation, n_t); - map[n].translation[n_t] = '\0'; - } - else { - if (n_t > max_len) max_len = n_t; - for (i = 0; i < n_t; i++) { - code = translation[i]; - if (isprint(code) && code != '\\' && code != '\"' && code != '\'') { - printf(" '%c',", code); - } else { - printf(" %3d,", code); - } - } - printf(" END, /* Translation for %04x (offset %04x) */\n", - map[n].code, last_translation); - map[n].trans_offset = last_translation; - - /* last_translation points to the offset the next translation will start from */ - last_translation += n_t + 1; - } - } - } - printf("};\n\n const int charset_max_translation = %d;\n\n", max_len); -} - -static void -setcode(int state, int character, int code) -{ - int i = 0; - - for (i = 0; i < map_num; i++) { - if (map[i].code == code) break; - } - - if (i == map_num) { - table[state].ch[character].code = code; - } else if (map[i].translation) { - table[state].ch[character].translation = map[i].translation; - } else if (map[i].trans_offset) { - table[state].ch[character].trans_offset = map[i].trans_offset; - } else { - table[state].ch[character].code = map[i].mapcode[0]; - } - -} - -static void -readcharfile(char *name) -{ - FILE *charfile; - char buf[1024]; - char *p; - int line = 0; - int curstate = -1; - int thischar, thisstate; - int code, i, c; - - charfile = fopen(name, "r"); - if (!charfile) { - perror(name); - exit(1); - } - - table_num = 0; - - while (fgets(buf, sizeof(buf), charfile)) { - line++; - p = buf + strlen(buf); - if (p > buf && p[-1] == '\n') p[-1] = '\0'; - p = buf; - while (*p && Uisspace(*p)) p++; - if (!*p || *p == '#') continue; - - if (*p == ':') { - /* New state */ - curstate = newstate(p+1); - continue; - } - - if (curstate == -1) { - curstate = newstate(""); - } - - thisstate = curstate; - thischar = i = 0; - while (!Uisspace(*p)) { - c = HEXCHAR(*p); - i++; - p++; - if (c == XX) goto syntaxerr; - thischar = thischar*16 + c; - } - while (*p && Uisspace(*p)) p++; - - if (i > 4) goto syntaxerr; - if (i > 2) { - if (EMPTYTCHAR(table[thisstate].ch[thischar>>8])) { - /* we create a new state (not in the input file) to - deal with multibyte characters that start with the - byte 'thischar >> 8'. */ - - char action[1024]; - - sprintf(action, ">%s_%02x <", table[thisstate].name, - thischar>>8); - table[thisstate].ch[thischar>>8].action = xstrdup(action); - *(strchr(table[thisstate].ch[thischar>>8].action, ' ')) = '\0'; - table[thisstate].ch[thischar>>8].comment = xstrdup("multi-byte"); - thisstate = newstate(action+1); - } - else if (!table[thisstate].ch[thischar>>8].action || - table[thisstate].ch[thischar>>8].action[0] != '>') { - /* either we think this byte isn't the start of a - multibyte character, or the action associated with this - byte isn't a state change. */ - - fprintf(stderr, - "%s: line %d: multibyte/single-byte conflict\n", - name, line); - exit(1); - } - else { - /* we find the already created state to deal with multibytes - starting with 'thischar >> 8' and move to it so we - insert the 2nd byte of this multibyte char in the right - state. */ - - thisstate = - findstate(table[thisstate].ch[thischar>>8].action+1); - if (thisstate == -1) { - fprintf(stderr, - "%s: line %d: can't find multibyte state\n", - name, line); - exit(1); - } - } - thischar &= 0xff; - } - - if (!EMPTYTCHAR(table[thisstate].ch[thischar])) { - fprintf(stderr, "%s: line %d: duplicate defs for %x\n", - name, line, thischar); - exit(1); - } - - table[thisstate].ch[thischar].comment = xstrdup(buf); - - if (*p == '?') { - continue; - } - - if (*p == ':' || *p == '>' || *p == '<') { - p = table[thisstate].ch[thischar].action = xstrdup(p); - while (*p && !Uisspace(*p)) p++; - *p = '\0'; - continue; - } - - code = 0; - for (i=0; i<4; i++) { - c = HEXCHAR(*p); - p++; - if (c == XX) goto syntaxerr; - code = code*16 + c; - } - setcode(thisstate, thischar, code); - } - fclose(charfile); - return; - syntaxerr: - fprintf(stderr, "%s: line %d: syntax error\n", name, line); - exit(1); -} - -/* Generate the table used for mapping raw unicode values */ -static void mkunicodetable(void) -{ - int i; - int thisstate; - unsigned char need_block[256]; - int block; - char buf[80]; - - /* Record which blocks we need mappings for */ - for (i = 0; i < 256; i++) { - need_block[i] = 0; - } - for (i = 0; i < map_num; i++) { - need_block[map[i].code>>8] = 1; - } - - table_num = 0; - - printf("/* The next two tables are used for doing translations on\n"); - printf(" * 16-bit unicode values. First look up the Unicode block\n"); - printf(" * (high-order byte) in the chartables_unicode_block table\n"); - printf(" * to find the index into chartables_unicode for that block.\n"); - printf(" * If the index is 255, there are no translations for that\n"); - printf(" * block, so characters can be encoded in UTF-8 algorithmically\n"); - printf(" * Otherwise, look up the low-order byte in the chartables_unicode\n"); - printf(" * using the index to select the state.\n"); - printf(" */\n"); - printf("const unsigned char chartables_unicode_block[256] = {"); - - for (block = 0; block < 256; block++) { - if (!(block & 0x7)) printf("\n"); - if (!need_block[block]) { - printf(" 255,"); - continue; - } - - sprintf(buf, "BLOCK-%02x-INDEX-%d", block, table_num); - thisstate = newstate(buf); - printf(" %3d,", thisstate); - - for (i = 0; i < 256; i++) { - setcode(thisstate, i, (block << 8) + i); - } - } - - printf("\n};\n\n"); - - printf("/* NOTE: Unlike other charset translation tables, the \n"); - printf(" * chartables_unicode table is NOT used to directly parse\n"); - printf(" * a charset. See the comment on chartables_unicode_block\n"); - printf(" * for a descripton of how this table is used.\n"); - printf(" */\n"); -} - -static void mkutf8table(void) -{ - int start_state, thisstate; - int thischar, prefix; - char buf[80]; - - table_num = 0; - - start_state = newstate("START"); - - /* Populate the ascii section */ - for (thischar = 0; thischar <= 0x7f; thischar++) { - setcode(start_state, thischar, thischar); - } - - /* 3-char sequence tables must be numbered 1 and 2 */ - thisstate = newstate("STATE-3-2 <"); - for (thischar = 0x80; thischar <= 0xbf; thischar++) { - table[thisstate].ch[thischar].action = "U83_2"; - } - thisstate = newstate("STATE-3-3 <"); - for (thischar = 0x80; thischar <= 0xbf; thischar++) { - table[thisstate].ch[thischar].action = "U83_3"; - } - - /* Populate 2-char sequences---the first byte shifts to another - * state; the 2nd byte chooses the character, just like any other - * 2-byte encoding */ - for (prefix = 2; prefix <= 0x1f; prefix++) { - sprintf(buf, ">STATE-2-%02x", prefix); - table[start_state].ch[prefix+0xc0].action = xstrdup(buf); - strcat(buf, " <"); - thisstate = newstate(xstrdup(buf+1)); - for (thischar = 0; thischar <= 0x3f; thischar++) { - setcode(thisstate, thischar+0x80, thischar+(prefix<<6)); - } - } - - /* Populate 3-char sequences, which the decoder handles - * magically, outside of the state system. */ - for (thischar = 0xe0; thischar <= 0xef; thischar++) { - table[start_state].ch[thischar].action = "U83"; - } - -} - -static char basis_64[] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - -static void mkutf7table(void) -{ - int start_state, thisstate; - int thischar; - char *p; - - table_num = 0; - - start_state = newstate("START"); - - /* Populate the ascii section */ - table[start_state].ch['+'].action = ">GOTSHIFT"; - for (thischar = 0; thischar <= 0x7f; thischar++) { - if (!table[start_state].ch[thischar].action) { - setcode(start_state, thischar, thischar); - } - } - - /* Normal base64 decoding table must be numbered 1 */ - thisstate = newstate("B64NORMAL <"); - table[thisstate].ch['-'].action = "<"; - for (p = basis_64; *p; p++) { - table[thisstate].ch[*(unsigned char*)p].action = "U7N"; - } - for (thischar = 0; thischar <= 0x7f; thischar++) { - if (!table[thisstate].ch[thischar].action) { - setcode(thisstate, thischar, thischar); - } - } - - /* Populate initial base64 decoding table */ - thisstate = newstate("GOTSHIFT <"); - setcode(thisstate, '-', '+'); - for (p = basis_64; *p; p++) { - table[thisstate].ch[*(unsigned char*)p].action = "U7F"; - } -} - -static int -newstate(char *args) -{ - char *p; - int i; - - if (table_num == table_alloc) { - table_alloc += TABLEGROW; - table = (struct table *)xrealloc((char *)table, - table_alloc * sizeof(struct table)); - } - - table[table_num].name = xstrdup(args); - table[table_num].endaction = "END"; - for (i = 0; i < 256; i++) { - table[table_num].ch[i].code = -1; - table[table_num].ch[i].translation = 0; - table[table_num].ch[i].trans_offset = 0; - table[table_num].ch[i].action = 0; - table[table_num].ch[i].comment = 0; - } - - p = table[table_num].name; - while (*p && !Uisspace(*p)) p++; - if (*p) *p++ = '\0'; - while (*p) { - if (*p == '<') table[table_num].endaction = "RET"; - p++; - } - - return table_num++; -} - -static int -findstate(char *name) -{ - int i; - - for (i = 0; i < table_num; i++) { - if (!strcmp(name, table[i].name)) return i; - } - return -1; -} - -static void -printtable(char *name) -{ - char buf[1024]; - char *p; - int curstate, thischar; - int code; - char *end; - int i; - - p = strrchr(name, '/'); - if (!p) p = strrchr(name, '\\'); - if (p) p++; - else p = name; - strcpy(buf, p); - if ((p = strchr(buf, '.')) != NULL) *p = '\0'; - while ((p = strchr(buf, '-')) != NULL) *p = '_'; - - printf("const unsigned char chartables_%s[%d][256][4] = {\n", buf, table_num); - - for (curstate = 0; curstate < table_num; curstate++) { - printf(" {"); - if (table[curstate].name[0]) { - printf(" /* %s */", table[curstate].name); - } - printf("\n"); - - for (thischar = 0; thischar < 256; thischar++) { - printf(" {"); - if ((code = table[curstate].ch[thischar].code) != -1) { - if (code && code <= 0x7f) { - if (isprint(code) && code != '\\' && code != '\"' && - code != '\'') { - printf(" '%c', %s, 0, 0,", code, - table[curstate].endaction); - } - else { - printf(" %3d, %s, 0, 0,", code, - table[curstate].endaction); - } - } - else if (code <= 0x7FF) { - printf(" %3d, %3d, %s, 0,", 0xc0 + (code>>6), - 0x80+(code&0x3f), table[curstate].endaction); - } - else { - printf(" %3d, %3d, %3d, %s,", 0xe0 + (code>>12), - 0x80+((code>>6)&0x3f), 0x80+(code&0x3f), - table[curstate].endaction); - } - } else if ((code = table[curstate].ch[thischar].trans_offset) != 0) { - printf(" XLT, %3d, %3d, %s,", code >> 8, code & 0xff, - table[curstate].endaction); - } else if ((p = table[curstate].ch[thischar].translation) != 0) { - end = table[curstate].endaction; - for (i = 0; i < 4; i++) { - if (isprint((unsigned char)*p) && *p != '\\' && *p != '\"' && *p != '\'') { - printf(" '%c',", *p); - } - else if (!*p) { - printf(" %s,", end); - end = " 0"; - } - else { - printf(" %3d,", (unsigned char)*p); - } - if (*p) p++; - } - } - else if ((p = table[curstate].ch[thischar].action) == 0) { - printf(" EMPTY, %s, 0, 0,", table[curstate].endaction); - } - else if (*p == '<') { - printf(" RET, 0, 0, 0,"); - } - else if (*p == 'U') { - printf(" %s, 0, 0, 0,", p); - } - else { - code = findstate(p+1); - if (code == -1) { - fprintf(stderr, "%s: unknown state %s\n", name, p+1); - } - printf(" %s, %3d, %3d, 0,", - *p == '>' ? "JSR" : "JMP", - (code>>8), (code&0xff)); - } - printf(" },"); - if (table[curstate].ch[thischar].comment) { - printf(" /* %s */", table[curstate].ch[thischar].comment); - } - printf("\n"); - } - printf(" },\n"); - } - printf("};\n\n"); -} - -static void -freetabledata(void) -{ - int curstate, thischar; -/* char *cp; */ - - for (curstate = 0; curstate < table_num; curstate++) { - for (thischar = 0; thischar < 256; thischar++) { - if (table[curstate].ch[thischar].comment != NULL) { - free(table[curstate].ch[thischar].comment); - } - - if (table[curstate].ch[thischar].action != NULL) { - free(table[curstate].ch[thischar].action); - } - } - if (table[curstate].name != NULL) { - free(table[curstate].name); - } - } -} - -static void -freetable(void) -{ - if (table_alloc) { - free(table); - table_alloc=0; - } -} - -static void -freemap(void) -{ - int n; -/* int n_mapcode; */ - - for (n = 0; n < map_num; n++) { - if (map[n].translation != NULL) { - free(map[n].translation); - } - } - - if (map_alloc) { - free(map); - map_alloc=0; - } -} - -void fatal(const char* s, int c) -{ - fprintf(stderr, "Error while building charset table: %s\n", s); - exit(c); -} diff --git a/lib/mkchartable.pl b/lib/mkchartable.pl new file mode 100644 index 0000000..b9fbbeb --- /dev/null +++ b/lib/mkchartable.pl @@ -0,0 +1,531 @@ +#!/usr/bin/perl +# +# mkchartable.pl -- Generate character set mapping table +# +# Copyright (c) 1994-2008 Carnegie Mellon University. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# 3. The name "Carnegie Mellon University" must not be used to +# endorse or promote products derived from this software without +# prior written permission. For permission or any legal +# details, please contact +# Carnegie Mellon University +# Center for Technology Transfer and Enterprise Creation +# 4615 Forbes Avenue +# Suite 302 +# Pittsburgh, PA 15213 +# (412) 268-7393, fax: (412) 268-7395 +# innovation@andrew.cmu.edu +# +# 4. Redistributions of any form whatsoever must retain the following +# acknowledgment: +# "This product includes software developed by Computing Services +# at Carnegie Mellon University (http://www.cmu.edu/computing/)." +# +# CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO +# THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +# AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE +# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN +# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING +# OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# +# $Id: mkchartable.pl,v 1.27 2008/03/24 17:43:09 murch Exp $ + +use strict; +use warnings; + +use IO::File; +use Getopt::Long; +use Digest::SHA1; + +my @maps; +my %codemap; +GetOptions( 'map|m=s' => \@maps ); + +printheader(\@maps, \@ARGV); + +# first we parse the chartable unicode mappings and the fixes +# file to build the unicode to search canonical form tables. +foreach my $map (@maps) { + readmapfile(\%codemap, $map); +} + +# we follow any mappings repeatedly until nothing in the +# table doesn't change any more +mungemap(\%codemap); + +# then print out the translation tables +printmap(\%codemap); + +# XXX - should probably require all files that are +# mentioned in the lookup table to be specified, +# or this sucker aintn't gunna compile. +foreach my $opt (@ARGV) { + warn "mkchartable: mapping $opt...\n"; + my $table = readcharfile($opt); + printtable($table, $opt); +} + +printlookup(); + +exit 0; + +sub usage { + warn "usage: mkchartable -m mapfile charsetfile...\n"; + exit(1); +} + +# Read a Unicode table, deriving useful mappings from it +sub readmapfile { + my ($codemap, $name) = @_; + + my $mapfile = IO::File->new($name, 'r') || die "Failed to open $name\n"; + + while (my $line = $mapfile->getline()) { + chomp $line; + $line =~ s/^\s+//; # strip leading space + next if $line =~ m/^\#/; # comment + next if $line eq ''; # blank + + my ($hexcode, $name, $category, $combiningclass, $bidicat, + $decomposition, $decimal, $digit, $numeric, $mirroredchar, + $uni1name, $comment, $upper, $lower, @rest) = split ';', $line; + my $code = hex($hexcode); + + if ($code != 32 and $category =~ m/^Z/) { + $codemap->{$code}{chars} = [32]; # space + next; + } + + # Compatability mapping, skip over the + while ($decomposition ne '') { + if ($decomposition =~ s/^<[^>]*>\s+//) { + # Ignore compat mappings to SP followed by combining char + $decomposition = '' if $decomposition =~ m/^0020 / + } + + if ($decomposition =~ s/([0-9a-fA-F]+)\s*//) { + push @{$codemap->{$code}{chars}}, hex($1); + } + } + + # Lower case equivalent mapping + if ($lower) { + $codemap->{$code}{chars} = [hex($lower)]; + } + } +} + +# Perform the transitive closure on the unicode mapping table +# Calculate translations for mappings +sub mungemap { + my ($codemap) = @_; + + my $didchange = 1; + + # Keep scanning the table until no changes are made + while ($didchange) { + warn "mkchartable: expanding unicode mappings...\n"; + + $didchange = 0; + + foreach my $code (sort { $a <=> $b } keys %$codemap) { + my @new; + my $chars = $codemap->{$code}{chars}; + + # check if there are any translations for the mapped chars + foreach my $char (@$chars) { + if ($codemap->{$char}) { + $didchange = 1; + my $newchars = $codemap->{$char}{chars}; + push @new, @$newchars; + } + else { + push @new, $char; + } + } + + # strip all whitespace, but put back one if nothing left + if (grep { $_ == 32 } @new) { + @new = grep { $_ != 32 } @new; + @new = (32) unless @new; + } + + $codemap->{$code}{chars} = \@new; + } + }; + + warn "mkchartable: building expansion table...\n"; + + print < $b } keys %$codemap) { + my $chars = $codemap->{$code}{chars}; + if (@$chars > 1) { + $maxlen = @$chars if $maxlen < @$chars; + + # add to the translation table + print " "; + print join(", ", (map { sprintf("0x%04x", $_) } @$chars)); + printf ", 0, /* Translation for %04x (offset %d) */\n", $code, $offset; + + # update tracking + $codemap->{$code}{trans} = $offset; + $offset += @$chars + 1; + } + } + + print <> 16) & 0xff][($code >> 8) & 0xff] = 1; + } + + print << "EOF"; +/* The next two tables are used for doing translations from + * 24-bit unicode values to canonical form. First look up the + * code >> 16 (highest order block) in the block16 table to + * find the index to the block8 table for that block. + * If the index is 255, there are no translations for that + * block, so return the same value. Otherwise, repeat for + * code >> 8 (middle block) to get an index into the + * direct translation block. Again, 255 means no translations + * for that block. Finally the translation can be one of. + * + * 0: no output + * +ve char: return this single char + * -ve number: offset into the chartables_translation_multichar + * table. Read chars until 0 encountered. + */ +const unsigned char chartables_translation_block16[256] = { +EOF + + my $n16 = 0; + foreach my $block16 (0..255) { + if ($needblock[$block16]) { + printf(" %3d,", $n16++); + } else { + printf(" 255,"); + } + print "\n" if ($block16 % 8 == 7); + } + + print <[$block8]) { + printf(" %3d,", $n8++); + } else { + printf(" 255,"); + } + print "\n " if ($block8 % 8 == 7); + } + print "},\n"; + } + + print <[$block8]; + print " { /* Mapping for unicode chars in block $block16 $block8 */\n "; + foreach my $i (0..255) { + my $codepoint = ($block16 << 16) + ($block8 << 8) + $i; + if (not $codemap->{$codepoint}) { + printf " 0x%04x,", $codepoint; + } + elsif ($codemap->{$codepoint}{trans}) { + printf " - %4d,", $codemap->{$codepoint}{trans}; + } + else { + printf " 0x%04x,", $codemap->{$codepoint}{chars}[0]; + } + print "\n " if ($i % 8 == 7); + } + print "},\n"; + } + } + printf("};\n\n"); +} + +# read a charset table, building intermediate state tables +# for multibyte sequences and named state tables for mode +# switches +sub readcharfile { + my ($name) = @_; + + my $charfile = IO::File->new($name, 'r') || die "Failed to read $name"; + + my %data = ( + currstate => -1, + num => 0, + tables => [], + states => {}, + ); + + my $state; + + while (my $line = $charfile->getline()) { + chomp $line; + my $comment = $line; + $line =~ s/^\s+//; # strip leading space + next if $line =~ m/^\#/; # comment + next if $line eq ''; # blank + + if ($line =~ m/^:(\S+)/) { + # New state + $state = getstate(\%data, $1); + next; + } + + $state ||= getstate(\%data, ""); + + die "Invalid data line $line\n" unless $line =~ s/^([0-9a-fA-F]+)\s+//; + + my $code = hex($1); + + my $basestate = $state; + + if ($code > 0xffffff) { + my $char = ($code >> 24) & 0xff; + my $newname = sprintf "%s_%02x", $state->{name} || 'state', $char; + my $newstate = getstate(\%data, $newname); + $state->{chars}[$char] = [0, $newstate->{num}, "Auto multibyte state 4 bytes $newname"]; + $state = $newstate; + } + if ($code > 0xffff) { + my $char = ($code >> 16) & 0xff; + my $newname = sprintf "%s_%02x", $state->{name} || 'state', $char; + my $newstate = getstate(\%data, $newname); + $state->{chars}[$char] = [0, $newstate->{num}, "Auto multibyte state 3 bytes $newname"]; + $state = $newstate; + } + if ($code > 0xff) { + my $char = ($code >> 8) & 0xff; + my $newname = sprintf "%s_%02x", $state->{name} || 'state', $char; + my $newstate = getstate(\%data, $newname); + $state->{chars}[$char] = [0, $newstate->{num}, "Auto multibyte state 2 bytes $newname"]; + $state = $newstate; + } + + my $char = $code & 0xff; + die "Duplicate defs for $char in $state->{name}" + if $state->{chars}[$char]; + + # nothing + if ($line =~ m/^\?/) { + next; + } + + # state switch + if ($line =~ m/^:(\S*)/) { + my $targetstate = getstate(\%data, $1); + $state->{chars}[$char] = [0, $targetstate->{num}, $comment]; + } + else { + # otherwise it's a regular char + die "Invalid data line $line\n" unless $line =~ s/^([0-9a-fA-F]+)\s+//; + my $target = hex($1); + $state->{chars}[$char] = [$target, $basestate->{num}, $comment]; + } + + $state = $basestate; + } + + return \%data; +} + +# helper function to create a new state within a charset +sub getstate { + my ($data, $name) = @_; + + if (exists $data->{states}{$name}) { # could be 0 + return $data->{tables}[$data->{states}{$name}]; + } + + my $num = $data->{num}; + + my $next = $num; + if ($name =~ s/ \<$//) { + $next = -1; + } + + my $state = $data->{tables}[$num] = { + name => $name, + num => $num, + next => $next, + codes => {}, + }; + $data->{states}{$name} = $num; + + $data->{num}++; + + return $state; +} + +# output the table used for charset->unicode translation +sub printtable { + my ($data, $name) = @_; + + my $num = $data->{num}; + my $tables = $data->{tables}; + + $name =~ s{.*[\\/]}{}; # strip anything up to the last separator; + $name =~ s{\..*}{}; # after a dot + $name =~ s{-}{_}g; # underscores + + print "const struct charmap chartables_$name\[$num][256] = {\n"; + + foreach my $table (@$tables) { + my $chars = $table->{chars}; + print " {"; + if ($table->{name}) { + print " /* $table->{name} */"; + } + print "\n"; + foreach my $i (0..255) { + my $char = $chars->[$i]; + if ($char) { + print " { $char->[0], $char->[1] }, /* $char->[2] */\n"; + } + else { + print " { 0, 0 }, /* no entry */\n"; + } + } + print " },\n"; + } + print "};\n\n"; +} + +# print the header of the chartable.c file +sub printheader { + my ($maps, $charsets) = @_; + + print <new($file, 'r') || return ""; + my $digest = Digest::SHA1->new(); + $digest->addfile($fh); + return $digest->hexdigest(); +} + +__END__ -- 1.5.6.5