diff --git a/VersionNum b/VersionNum index 7c913d4a8aa00e84f9bd72cc1edeef8bc070d661..0fbcae5d571a896616c7e155ce8b7a5df6aac814 100644 --- a/VersionNum +++ b/VersionNum @@ -1,23 +1,23 @@ -/* (0.55) +/* (0.56) * * This file is automatically maintained by srccommit, do not edit manually. - * Last processed by srccommit version: 1.2. + * Last processed by srccommit version: 1.1. * */ -#define Module_MajorVersion_CMHG 0.55 +#define Module_MajorVersion_CMHG 0.56 #define Module_MinorVersion_CMHG -#define Module_Date_CMHG 26 Aug 2005 +#define Module_Date_CMHG 05 Dec 2008 -#define Module_MajorVersion "0.55" -#define Module_Version 55 +#define Module_MajorVersion "0.56" +#define Module_Version 56 #define Module_MinorVersion "" -#define Module_Date "26 Aug 2005" +#define Module_Date "05 Dec 2008" -#define Module_ApplicationDate "26-Aug-05" +#define Module_ApplicationDate "05-Dec-08" #define Module_ComponentName "Unicode" -#define Module_ComponentPath "RiscOS/Sources/Lib/Unicode" +#define Module_ComponentPath "castle/RiscOS/Sources/Lib/Unicode" -#define Module_FullVersion "0.55" -#define Module_HelpVersion "0.55 (26 Aug 2005)" -#define Module_LibraryVersionInfo "0:55" +#define Module_FullVersion "0.56" +#define Module_HelpVersion "0.56 (05 Dec 2008)" +#define Module_LibraryVersionInfo "0:56" diff --git a/c/enc_utf8 b/c/enc_utf8 index a952b1ef8e89b971b0549935e3678ab9e027d92a..b517154cc6db8d5a9e8f8544946e8b12ce3e43ef 100644 --- a/c/enc_utf8 +++ b/c/enc_utf8 @@ -52,7 +52,7 @@ static int utf8_reset(Encoding *e, int for_encoding) } static unsigned int utf8_read(Encoding *e, - encoding_read_callback_fn ucs_out, + encoding_read_callback_fn ucs_out, const unsigned char *s, unsigned int n, void *handle) @@ -78,22 +78,22 @@ static unsigned int utf8_read(Encoding *e, } else { + /* Reset the count of expected continuation bytes */ + ue->count = 0; + if (ucs_out) if (ucs_out(handle, 0xFFFD)) { - /* Character has been used, so ensure its counted */ - count--; + /* Do not consume the invalid 
continuation byte */ break; } - ue->count = 0; - goto retry; } } else { - if (c <= 0x80) + if (c < 0x80) u = c; else if (c < 0xC0 || c >= 0xFE) u = 0xFFFD; @@ -115,10 +115,14 @@ static unsigned int utf8_read(Encoding *e, ue->first = 0; + /* Reject surrogates (D800-DFFF) and the noncharacters FFFE/FFFF */ + if ((0xD800 <= u && u <= 0xDFFF) || u == 0xFFFE || u == 0xFFFF) + u = 0xFFFD; + if (ucs_out) if (ucs_out(handle, u)) { - /* Character has been used, so ensure its counted */ + /* Character has been used, so ensure it's counted */ count--; break; } diff --git a/c/encoding b/c/encoding index 84bd8f80c13f8d876d884903fefde4b5efecab4b..e919a2b909d05010b444896530b36bb6b5561df8 100644 --- a/c/encoding +++ b/c/encoding @@ -64,20 +64,20 @@ static char version[] = "Unicode library " Module_MajorVersion " " Module_Date " static EncList enclist[] = { { csASCII /* 3 */, 1, "/US-ASCII/", lang_ENGLISH, &enc_ascii, NULL, NULL }, - { csISOLatin1 /* 4 */, 1, "/ISO-8859-1/ISO-IR-100/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x41\x1B\x2E\x42\x1B\x2F\x50", NULL }, /* Select G1 Latin-1, G2 Latin-2, G3 supplement */ - { csISOLatin2 /* 5 */, 1, "/ISO-8859-2/ISO-IR-101/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x42\x1B\x2E\x41\x1B\x2F\x50", NULL }, /* Select G1 Latin-2, G2 Latin-1, G3 supplement */ + { csISOLatin1 /* 4 */, 1, "/ISO-8859-1/ISO-IR-100/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x41"/*\x1B\x2E\x42\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-1, G2 Latin-2, G3 supplement */ + { csISOLatin2 /* 5 */, 1, "/ISO-8859-2/ISO-IR-101/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x42"/*\x1B\x2E\x41\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-2, G2 Latin-1, G3 supplement */ { csISOLatin3 /* 6 */, 1, "/ISO-8859-3/ISO-IR-109/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x43", NULL }, /* Select Latin-3 right half */ { csISOLatin4 /* 7 */, 1, "/ISO-8859-4/ISO-IR-110/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x44", NULL }, /* Select Latin-4 right half */ { csISOLatinCyrillic /* 8 */, 1, "/ISO-8859-5/ISO-IR-144/", lang_RUSSIAN, &enc_iso8859,
"\x1B\x2D\x4C", NULL }, /* Select Cyrillic right half */ { csISOLatinGreek /* 10 */, 1, "/ISO-8859-7/ISO-IR-126/", lang_GREEK, &enc_iso8859, "\x1B\x2D\x46", NULL }, /* Select Greek right half */ { csISOLatinHebrew /* 11 */, 1, "/ISO-8859-8/ISO-IR-198/", lang_HEBREW, &enc_iso8859, "\x1B\x2D\x5E", NULL }, /* Select Hebrew right half */ - { csISOLatin5 /* 12 */, 1, "/ISO-8859-9/ISO-IR-148/", lang_TURKISH, &enc_iso8859, "\x1B\x2D\x4D\x1B\x2E\x42\x1B\x2F\x50", NULL }, /* Select G1 Latin-5, G2 Latin-2, G3 supplement */ - { csISOLatin6 /* 13 */, 1, "/ISO-8859-10/ISO-IR-157/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x56\x1B\x2E\x58", NULL }, /* Select Latin-6 right half, and Sami supplement as G2 */ + { csISOLatin5 /* 12 */, 1, "/ISO-8859-9/ISO-IR-148/", lang_TURKISH, &enc_iso8859, "\x1B\x2D\x4D"/*\x1B\x2E\x42\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-5, G2 Latin-2, G3 supplement */ + { csISOLatin6 /* 13 */, 1, "/ISO-8859-10/ISO-IR-157/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x56"/*\x1B\x2E\x58"*/, NULL }, /* Select Latin-6 right half, and Sami supplement as G2 */ { csISOLatinThai, 1, "/ISO-8859-11/ISO-IR-166/", lang_THAI, &enc_iso8859, "\x1B\x2D\x54", NULL }, /* Select Thai right half */ { csISOLatin7, 1, "/ISO-8859-13/ISO-IR-179/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x59", NULL }, /* Select Baltic Rim right half */ { csISOLatin8, 1, "/ISO-8859-14/ISO-IR-199/", lang_IRISH, &enc_iso8859, "\x1B\x2D\x5F", NULL }, /* Select Celtic right half */ - { csISOLatin9, 1, "/ISO-8859-15/ISO-IR-203/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x62\x1B\x2E\x42\x1B\x2F\x50", NULL }, /* Select G1 Latin-9, G2 Latin-2, G3 supplement */ - { csISOLatin10, 1, "/ISO-8859-16/ISO-IR-226/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x66\x1B\x2E\x41\x1B\x2F\x50", NULL }, /* Select G1 Latin-10, G2 Latin-1, G3 supplement */ + { csISOLatin9, 1, "/ISO-8859-15/ISO-IR-203/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x62"/*\x1B\x2E\x42\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-9, G2 Latin-2, G3 supplement */ + { 
csISOLatin10, 1, "/ISO-8859-16/ISO-IR-226/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x66"/*\x1B\x2E\x41\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-10, G2 Latin-1, G3 supplement */ { csISO6937, 2, "/ISO-IR-156/", lang_ENGLISH, &enc_iso6937, "\x1B\x2D\x52", NULL }, /* Select ISO6937 right half */ { csISO6937DVB, 2, "/X-ISO-6937-DVB/X-DVB/", lang_ENGLISH, &enc_iso6937, NULL, NULL }, { csShiftJIS /* 17 */, 2, "/SHIFT_JIS/X-SJIS/", lang_JAPANESE, &enc_shiftjis, NULL, NULL }, diff --git a/c/iso2022 b/c/iso2022 index ca7418c7dcb54aa341660e276d2efeffce5fe762..ace3799db811939e1a530ac78ef891a1cb690f26 100644 --- a/c/iso2022 +++ b/c/iso2022 @@ -29,6 +29,7 @@ #include <stdio.h> #include <string.h> +#include "charsets.h" #include "encpriv.h" #include "iso2022.h" @@ -63,8 +64,24 @@ struct ISO2022_Encoding unsigned char tempset; ISO2022_Set *oldset; + /* Whether escape sequences are disabled + * + * Value: Meaning: + * 0 All escape sequences enabled + * 1 Only SS2/3 escape sequences enabled + * 2 All escape sequences disabled + */ unsigned char esc_disabled; + /* Whether C1 control characters are permitted + * + * Value: Meaning: + * 0 No C1 control characters permitted + * 1 Only 0x8E/0x8F permitted + * 2 All C1 control characters permitted + */ + unsigned char c1_permitted; + /* Pending escape commands */ unsigned char esc_pending; unsigned char esc_multi; @@ -133,7 +150,7 @@ static UCS4 simple_double_next_code_94(ISO2022_Set *s, int c, int invoker, unsig if (c == 0x00 || c == 0x5F) { *sync = 0; - return c + 0x20; + return invoker == _GL ? 
c + 0x20 : 0xFFFD; } if (!*sync) @@ -147,8 +164,6 @@ static UCS4 simple_double_next_code_94(ISO2022_Set *s, int c, int invoker, unsig *sync = 0; return u; } - - NOT_USED(invoker); } static UCS4 null_double_next_code(ISO2022_Set *s, int c, int invoker, unsigned char *sync) @@ -376,6 +391,7 @@ static int iso2022_reset(Encoding *e, int for_encoding) i->CR_s = C1; i->GR_s = G1; + i->c1_permitted = 2; i->esc_disabled = 0; i->esc_pending = i->esc_revision = 0; i->tempset = 0; @@ -391,18 +407,41 @@ static int iso2022_reset(Encoding *e, int for_encoding) iso2022_select_set(i, C0, 32, C0_ISO646); iso2022_select_set(i, C1, 32+1, C1_ISO6429); - /* ISO8859 and EUC variants of IOS2022 require preloading with + /* ISO8859 and EUC variants of ISO2022 require preloading with escape sequences to get the appropriate tables */ if (e->list_entry->preload) { + char euc = 0; unsigned int n = strlen(e->list_entry->preload); if (n != e->read(e, NULL, (unsigned char *)e->list_entry->preload, n, NULL)) return 0; - /* if we've preloaded then we need to disable further escape - * sequences otherwise stray control sequences (eg 8E, 8F) - * will try and switch tables */ - i->esc_disabled = 1; + if (e->list_entry->identifier == csEUCPkdFmtJapanese || + /* e->list_entry->identifier == csKSC56011987 || */ + e->list_entry->identifier == csEUCKR || + e->list_entry->identifier == csGB2312) + { + euc = 1; + } + + /* If we've preloaded and we're not handling an EUC variant + * then we need to disable further escape sequences otherwise + * stray control sequences (eg 8E, 8F) will try and switch tables. + * + * If we're handling an EUC variant which has loaded tables into + * G2 and G3, then SS2/SS3 are permitted. 
*/ + if (euc && ((simple_set *)i->Set[G2])->table && + ((simple_set *)i->Set[G3])->table) + { + i->esc_disabled = 1; + i->c1_permitted = 1; + } + else + { + i->esc_disabled = 2; + if (euc) + i->c1_permitted = 0; + } } if (for_encoding != encoding_READ) @@ -602,6 +641,12 @@ static unsigned int iso2022_read(EncodingPriv *e, { u = 0xFFFD; } + /* or illegal continuation bytes */ + else if ((i->sync[_GL] && (c < 0x20 || c > 0x7F)) || + (i->sync[_GR] && (c < 0xA0))) + { + u = 0xFFFD; + } else if (i->esc_pending) { u = iso2022_esc_cont(i, c); @@ -619,7 +664,15 @@ static unsigned int iso2022_read(EncodingPriv *e, else if (c < 0xA0) { i->sync[_GL] = i->sync[_GR] = 0; - u = i->CR->next_code(i->CR, c - 0x80, _CR, NULL); + if (i->c1_permitted == 2 || + (i->c1_permitted == 1 && (c == 0x8E || c == 0x8F))) + { + u = i->CR->next_code(i->CR, c - 0x80, _CR, NULL); + } + else + { + u = 0xFFFD; + } } else { @@ -635,9 +688,9 @@ static unsigned int iso2022_read(EncodingPriv *e, break; case 0x0E: if (!i->esc_disabled) { iso2022_ls(i, G1); continue; } break; - case 0x8E: if (!i->esc_disabled) { iso2022_ss(i, G2); continue; } + case 0x8E: if (i->esc_disabled < 2) { iso2022_ss(i, G2); continue; } break; - case 0x8F: if (!i->esc_disabled) { iso2022_ss(i, G3); continue; } + case 0x8F: if (i->esc_disabled < 2) { iso2022_ss(i, G3); continue; } break; } @@ -874,6 +927,8 @@ static int iso2022_scan_sets(ISO2022_Encoding *enc, UCS4 u, int *index, int *tab /* UNIDBG(("scan_table: set %d table %p\n", set, setptr->table)); */ + if (setptr->table == NULL) continue; + if ((i = encoding_lookup_in_table(u, setptr->table)) != -1) { *index = i; @@ -904,8 +959,10 @@ static int iso2022_write_euc(EncodingPriv *e, UCS4 u, unsigned char **euc, int * retry: /* control chars */ - if (u < 0x0021) - buf[out++] = u; + if (u < 0x0021 || u == 0x007F) + buf[out++] = u; + else if ((enc->c1_permitted == 2 && 0x0080 <= u && u <= 0x009F)) + buf[out++] = u; /* main chars */ else if (iso2022_scan_sets(enc, u, &index, &set, 
&n_entries)) diff --git a/c/iso6937 b/c/iso6937 index b5d7c7f08674c7525fc4c7327f0631d31d4e09cd..e8da112a8bf944b7fc5347dd1e5340521395a138 100644 --- a/c/iso6937 +++ b/c/iso6937 @@ -351,11 +351,13 @@ static UCS4 iso6937_combine(Accent a, unsigned char letter) static int iso6937_find_accent_pair(UCS4 u) { - for (int a = 1; a <= 15; a++) + int a, i; + + for (a = 1; a <= 15; a++) { if (iso6937_combination_table[a].combination) { - for (int i = 0; i < iso6937_combination_table[a].ncombinations; i++) + for (i = 0; i < iso6937_combination_table[a].ncombinations; i++) { if (iso6937_combination_table[a].combination[i].u == u) { diff --git a/c/johab b/c/johab index b58702de4762baeb75af6524b4554d942a9574ee..2f7976039be58d415bd73c4b1728c2da25ce4a99 100644 --- a/c/johab +++ b/c/johab @@ -113,10 +113,10 @@ static UCS4 hangul_to_ucs(unsigned int c1, unsigned int c2) /* Hangul is --X */ static const unsigned char final_only[28] = { - 0, 0, 0, 0x33, 0, 0x35, 0x36, 0, - 0, 0, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, - 0x40, 0, 0, 0, 0x44, 0, 0, 0, - 0, 0, 0, 0 + 0, 0, 0, 0x33, 0, 0x35, 0x36, + 0, 0, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, + 0x3F, 0x40, 0, 0, 0x44, 0, 0, + 0, 0, 0, 0, 0, 0, 0 }; u = 0x3100 + final_only[final]; @@ -247,8 +247,10 @@ static unsigned int johab_read(Encoding *e, } else { - if (c < 0x80) + if (c < 0x80 && c != 0x5C) /* Standard ASCII... 
*/ u = c; + else if (c == 0x5C) /* ...except 0x5C, which maps to Won */ + u = 0x20A9; else if ((c >= 0x84 && c <= 0xD3) || (c >= 0xD8 && c <= 0xDE) || (c >= 0xE0 && c <= 0xF9)) @@ -330,13 +332,15 @@ static int ucs_jamo_to_johab(UCS4 u) static int johab_write(EncodingPriv *e, UCS4 u, unsigned char **johab, int *bufsize) { Johab_Encoding *je = (Johab_Encoding *) e; - int c = '?'; + int c = 0xFFFD; if (u == NULL_UCS4) return 0; - if (u <= 0x7F) /* Basic Latin */ + if (u <= 0x7F && u != 0x5C) /* Basic Latin */ c = u; + else if (u == 0x20A9) /* Won Sign, mapped to 0x5C */ + c = 0x5C; else if (u >= 0xAC00 && u <= 0xD7A3) /* Hangul syllables */ c = ucs_hangul_to_johab(u); else if (u >= 0x3131 && u <= 0x3163) /* Modern Jamo */ @@ -370,6 +374,11 @@ static int johab_write(EncodingPriv *e, UCS4 u, unsigned char **johab, int *bufs } } + if (c == 0xFFFD && e->for_encoding == encoding_WRITE_STRICT) + return -1; + else if (c == 0xFFFD) + c = '?'; + if ((*bufsize -= (c > 0xFF ? 2 : 1)) < 0 || !johab) return 0; diff --git a/c/shiftjis b/c/shiftjis index 6661db3cfcb8fa96a757fb93b7b17f6cccf3e794..d0bc95c71e383b3517e0933def13941930ad4bb3 100644 --- a/c/shiftjis +++ b/c/shiftjis @@ -170,7 +170,7 @@ static unsigned int shiftjis_read(Encoding *e, else { if (c < 0x80) - u = c == 0x5C ? 0x00A5 : c; /* CP932 is as Basic Latin, except for yen */ + u = c == 0x5C ? 0x00A5 : (c == 0x7E ? 0x203E : c); /* CP932 is as Basic Latin, except for yen and overbar */ else if (c == 0x80) u = 0x005C; /* Backslash - a Mac extension */ else if (c < 0xA0) @@ -214,7 +214,7 @@ static int lookup_table(UCS4 u, ShiftJIS_Encoding *sj, int *index, int *table_no { int i; - if (u >= 0x21 && u <= 0x7E && u != 0x5C) /* lower set is ASCII, except... */ + if (u >= 0x21 && u < 0x7E && u != 0x5C) /* lower set is ASCII, except... 
*/ { *table_no = 0; *index = u - 0x21; @@ -228,6 +228,13 @@ static int lookup_table(UCS4 u, ShiftJIS_Encoding *sj, int *index, int *table_no return 1; } + if (u == 0x203E) /* slot 7E is overbar */ + { + *table_no = 0; + *index = 0x7E - 0x21; + return 1; + } + if ((i = encoding_lookup_in_table(u, sj->katakana)) != -1) { *table_no = 1; diff --git a/c/textconv b/c/textconv index 2d8ff26ad88a466432a46f7ba47a9ac68df7889d..2c88180a4971e42df0116b72662cb1a5d06ebca3 100644 --- a/c/textconv +++ b/c/textconv @@ -64,8 +64,8 @@ static int usage(void) static int src_enc = csCurrent; static int dst_enc = csCurrent; -static FILE *in = stdin; -static FILE *out = stdout; +static FILE *in; +static FILE *out; static Encoding *read, *write; static char inbuf[256], outbuf[256]; static unsigned int src_flags, dst_flags; @@ -181,6 +181,15 @@ int main(int argc, char **argv) return 1; } } + else + { + out = stdout; + } + } + else + { + in = stdin; + out = stdout; } if (src_enc == dst_enc) diff --git a/c/unix b/c/unix index 99ec49b0cbbb537463302ffb0383b97d42bf00b0..3a0bc93a43c69212f1c88acff2a39ea7ecb37601 100644 --- a/c/unix +++ b/c/unix @@ -30,6 +30,8 @@ #include "layers_dbg.h" #endif +#include <dirent.h> + #include <string.h> #include <stdio.h> @@ -37,9 +39,12 @@ int encoding__load_map_file(const char *leaf, UCS2 **ptable, int *pn_entries, int *palloc) { + DIR *dir; FILE *fh; int flen; char fname[1024]; + char *slash; + struct dirent *dp; void *table; int n_entries; @@ -56,6 +61,27 @@ int encoding__load_map_file(const char *leaf, UCS2 **ptable, int *pn_entries, in strncat(fname, leaf, sizeof(fname)); fname[sizeof(fname)-1] = 0; + /* We get to search the directory, because the leafname may be a prefix */ + slash = strrchr(fname, '/'); + if (!slash) return 0; + + *slash = '\0'; + slash++; + + dir = opendir(fname); + if (!dir) return 0; + + while ((dp = readdir(dir)) != NULL) { + if (strncmp(dp->d_name, slash, strlen(slash)) == 0) { + *(slash - 1) = '/'; + *slash = '\0'; + strncat(fname, 
dp->d_name, sizeof(fname) - strlen(fname) - 1); + break; + } + } + + closedir(dir); + fh = fopen(fname, "rb"); if (!fh) return 0; diff --git a/ccsolaris/Makefile b/ccsolaris/Makefile index dd7e0ce9f27c452f9f28d4ac53081213a46be4d6..df06da45ffb0b0e2ef619af840494687b7c6422f 100644 --- a/ccsolaris/Makefile +++ b/ccsolaris/Makefile @@ -14,9 +14,25 @@ # # Project: Unicode -CC=gcc +ifeq ($(findstring riscos,$(TARGET)),riscos) + GCCSDK_INSTALL_CROSSBIN ?= /home/riscos/cross/bin -CCflags=-funsigned-char + CC = $(wildcard $(GCCSDK_INSTALL_CROSSBIN)/*gcc) + + ifeq ($(findstring module,$(TARGET)),module) + PlatCCflags = -mmodule + endif + + PlatObjs = riscos.o +else + CC = gcc + + PlatObjs = unix.o +endif + +HOST_CC = gcc + +CCflags = -funsigned-char -g -O0 $(PlatCCflags) .c.o:; $(CC) -c -DDEBUG=0 $(CCflags) -o $@ $< @@ -40,7 +56,8 @@ Objects = autojp.o \ enc_system.o \ acorn.o \ combine.o \ - unix.o + debug.o \ + $(PlatObjs) all: ucodelib.a textconv @@ -48,7 +65,10 @@ ucodelib.a: $(Objects) ${AR} r $@ $(Objects) textconv: textconv.o ucodelib.a - ${CC} -o $@ textconv.o ucodelib.a + ${CC} $(CCflags) -o $@ textconv.o ucodelib.a + +mkunictype: mkunictype.c + ${HOST_CC} -o $@ $< clean: @-rm mkunictype textconv