Andrew Hodgkinson · 69a25816 · 69a25816 · 69a25816 · 69a25816 · 69a25816
10 changed files
--- a/VersionNum
+++ b/VersionNum
-/* (0.55)
+/* (0.56)
 *
 * This file is automatically maintained by srccommit, do not edit manually.
- * Last processed by srccommit version: 1.2.
+ * Last processed by srccommit version: 1.1.
 *
 */
-#define Module_MajorVersion_CMHG        0.55
+#define Module_MajorVersion_CMHG        0.56
 #define Module_MinorVersion_CMHG        
-#define Module_Date_CMHG                26 Aug 2005
+#define Module_Date_CMHG                05 Dec 2008
-#define Module_MajorVersion             "0.55"
+#define Module_MajorVersion             "0.56"
-#define Module_Version                  55
+#define Module_Version                  56
 #define Module_MinorVersion             ""
-#define Module_Date                     "26 Aug 2005"
+#define Module_Date                     "05 Dec 2008"
-#define Module_ApplicationDate          "26-Aug-05"
+#define Module_ApplicationDate          "05-Dec-08"
 #define Module_ComponentName            "Unicode"
-#define Module_ComponentPath            "RiscOS/Sources/Lib/Unicode"
+#define Module_ComponentPath            "castle/RiscOS/Sources/Lib/Unicode"
-#define Module_FullVersion              "0.55"
+#define Module_FullVersion              "0.56"
-#define Module_HelpVersion              "0.55 (26 Aug 2005)"
+#define Module_HelpVersion              "0.56 (05 Dec 2008)"
-#define Module_LibraryVersionInfo       "0:55"
+#define Module_LibraryVersionInfo       "0:56"
--- a/c/enc_utf8
+++ b/c/enc_utf8
@@ -78,22 +78,22 @@ static unsigned int utf8_read(Encoding *e,
            }
            else
            {
+                /* Reset the count of expected continuation bytes */
+                ue->count = 0;
                if (ucs_out)
                    if (ucs_out(handle, 0xFFFD))
                    {
-                        /* Character has been used, so ensure its counted */
+                        /* Do not consume the invalid continuation byte */
-                        count--;
                        break;
                    }
-                ue->count = 0;
                goto retry;
            }
        }
        else
        {
-            if (c <= 0x80)
+            if (c < 0x80)
                u = c;
            else if (c < 0xC0 || c >= 0xFE)
                u = 0xFFFD;
@@ -115,10 +115,14 @@ static unsigned int utf8_read(Encoding *e,
        ue->first = 0;
+        /* Reject surrogates (U+D800..U+DFFF) and the noncharacters FFFE/FFFF.
+         * U+E000 is the first private-use code point and must pass through. */
+        if ((0xD800 <= u && u <= 0xDFFF) || u == 0xFFFE || u == 0xFFFF)
+            u = 0xFFFD;
        if (ucs_out)
            if (ucs_out(handle, u))
            {
-                /* Character has been used, so ensure its counted */
+                /* Character has been used, so ensure it's counted */
                count--;
                break;
            }

--- a/c/encoding
+++ b/c/encoding
@@ -64,20 +64,20 @@ static char version[] = "Unicode library " Module_MajorVersion " " Module_Date "
 static EncList enclist[] =
 {
 {   csASCII /* 3 */, 1, "/US-ASCII/", lang_ENGLISH, &enc_ascii, NULL, NULL },
- {   csISOLatin1 /* 4 */, 1, "/ISO-8859-1/ISO-IR-100/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x41\x1B\x2E\x42\x1B\x2F\x50", NULL }, /* Select G1 Latin-1, G2 Latin-2, G3 supplement */
+ {   csISOLatin1 /* 4 */, 1, "/ISO-8859-1/ISO-IR-100/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x41"/*\x1B\x2E\x42\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-1, G2 Latin-2, G3 supplement */
- {   csISOLatin2 /* 5 */, 1, "/ISO-8859-2/ISO-IR-101/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x42\x1B\x2E\x41\x1B\x2F\x50", NULL }, /* Select G1 Latin-2, G2 Latin-1, G3 supplement */
+ {   csISOLatin2 /* 5 */, 1, "/ISO-8859-2/ISO-IR-101/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x42"/*\x1B\x2E\x41\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-2, G2 Latin-1, G3 supplement */
 {   csISOLatin3 /* 6 */, 1, "/ISO-8859-3/ISO-IR-109/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x43", NULL },	                /* Select Latin-3 right half */
 {   csISOLatin4 /* 7 */, 1, "/ISO-8859-4/ISO-IR-110/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x44", NULL },	                /* Select Latin-4 right half */
 {   csISOLatinCyrillic /* 8 */, 1, "/ISO-8859-5/ISO-IR-144/", lang_RUSSIAN, &enc_iso8859, "\x1B\x2D\x4C", NULL },		/* Select Cyrillic right half */
 {   csISOLatinGreek /* 10 */, 1, "/ISO-8859-7/ISO-IR-126/", lang_GREEK, &enc_iso8859, "\x1B\x2D\x46", NULL },		/* Select Greek right half */
 {   csISOLatinHebrew /* 11 */, 1, "/ISO-8859-8/ISO-IR-198/", lang_HEBREW, &enc_iso8859, "\x1B\x2D\x5E", NULL },		/* Select Hebrew right half */
- {   csISOLatin5 /* 12 */, 1, "/ISO-8859-9/ISO-IR-148/", lang_TURKISH, &enc_iso8859, "\x1B\x2D\x4D\x1B\x2E\x42\x1B\x2F\x50", NULL },	/* Select G1 Latin-5, G2 Latin-2, G3 supplement */
+ {   csISOLatin5 /* 12 */, 1, "/ISO-8859-9/ISO-IR-148/", lang_TURKISH, &enc_iso8859, "\x1B\x2D\x4D"/*\x1B\x2E\x42\x1B\x2F\x50"*/, NULL },	/* Select G1 Latin-5, G2 Latin-2, G3 supplement */
- {   csISOLatin6 /* 13 */, 1, "/ISO-8859-10/ISO-IR-157/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x56\x1B\x2E\x58", NULL },	/* Select Latin-6 right half, and Sami supplement as G2 */
+ {   csISOLatin6 /* 13 */, 1, "/ISO-8859-10/ISO-IR-157/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x56"/*\x1B\x2E\x58"*/, NULL },	/* Select Latin-6 right half, and Sami supplement as G2 */
 {   csISOLatinThai, 1, "/ISO-8859-11/ISO-IR-166/", lang_THAI, &enc_iso8859, "\x1B\x2D\x54", NULL },                          /* Select Thai right half */
 {   csISOLatin7, 1, "/ISO-8859-13/ISO-IR-179/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x59", NULL },		                /* Select Baltic Rim right half */
 {   csISOLatin8, 1, "/ISO-8859-14/ISO-IR-199/", lang_IRISH, &enc_iso8859, "\x1B\x2D\x5F", NULL },	                        /* Select Celtic right half */
- {   csISOLatin9, 1, "/ISO-8859-15/ISO-IR-203/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x62\x1B\x2E\x42\x1B\x2F\x50", NULL },  /* Select G1 Latin-9, G2 Latin-2, G3 supplement */
+ {   csISOLatin9, 1, "/ISO-8859-15/ISO-IR-203/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x62"/*\x1B\x2E\x42\x1B\x2F\x50"*/, NULL },  /* Select G1 Latin-9, G2 Latin-2, G3 supplement */
- {   csISOLatin10, 1, "/ISO-8859-16/ISO-IR-226/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x66\x1B\x2E\x41\x1B\x2F\x50", NULL },  /* Select G1 Latin-10, G2 Latin-1, G3 supplement */
+ {   csISOLatin10, 1, "/ISO-8859-16/ISO-IR-226/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x66"/*\x1B\x2E\x41\x1B\x2F\x50"*/, NULL },  /* Select G1 Latin-10, G2 Latin-1, G3 supplement */
 {   csISO6937, 2, "/ISO-IR-156/", lang_ENGLISH, &enc_iso6937, "\x1B\x2D\x52", NULL },                         /* Select ISO6937 right half */
 {   csISO6937DVB, 2, "/X-ISO-6937-DVB/X-DVB/", lang_ENGLISH, &enc_iso6937, NULL, NULL },
 {   csShiftJIS /* 17 */, 2, "/SHIFT_JIS/X-SJIS/", lang_JAPANESE, &enc_shiftjis, NULL, NULL },

--- a/c/iso2022
+++ b/c/iso2022
@@ -29,6 +29,7 @@
 #include <stdio.h>
 #include <string.h>
+#include "charsets.h"
 #include "encpriv.h"
 #include "iso2022.h"
@@ -63,8 +64,24 @@ struct ISO2022_Encoding
    unsigned char tempset;
    ISO2022_Set *oldset;
+    /* Whether escape sequences are disabled
+     *
+     * Value: Meaning:
+     *   0    All escape sequences enabled
+     *   1    Only SS2/3 escape sequences enabled
+     *   2    All escape sequences disabled
+     */
    unsigned char esc_disabled;
+    /* Whether C1 control characters are permitted
+     *
+     * Value: Meaning:
+     *   0    No C1 control characters permitted
+     *   1    Only 0x8E/0x8F permitted
+     *   2    All C1 control characters permitted
+     */
+    unsigned char c1_permitted;
    /* Pending escape commands */
    unsigned char esc_pending;
    unsigned char esc_multi;
@@ -133,7 +150,7 @@ static UCS4 simple_double_next_code_94(ISO2022_Set *s, int c, int invoker, unsig
    if (c == 0x00 || c == 0x5F)
    {
        *sync = 0;
-        return c + 0x20;
+        return invoker == _GL ? c + 0x20 : 0xFFFD;
    }
    if (!*sync)
@@ -147,8 +164,6 @@ static UCS4 simple_double_next_code_94(ISO2022_Set *s, int c, int invoker, unsig
        *sync = 0;
        return u;
    }
-    NOT_USED(invoker);
 }
 static UCS4 null_double_next_code(ISO2022_Set *s, int c, int invoker, unsigned char *sync)
@@ -376,6 +391,7 @@ static int iso2022_reset(Encoding *e, int for_encoding)
    i->CR_s = C1;
    i->GR_s = G1;
+    i->c1_permitted = 2;
    i->esc_disabled = 0;
    i->esc_pending = i->esc_revision = 0;
    i->tempset = 0;
@@ -391,18 +407,41 @@ static int iso2022_reset(Encoding *e, int for_encoding)
    iso2022_select_set(i, C0, 32, C0_ISO646);
    iso2022_select_set(i, C1, 32+1, C1_ISO6429);
-    /* ISO8859 and EUC variants of IOS2022 require preloading with
+    /* ISO8859 and EUC variants of ISO2022 require preloading with
       escape sequences to get the appropriate tables */
    if (e->list_entry->preload)
    {
+        char euc = 0;
 	unsigned int n = strlen(e->list_entry->preload);
 	if (n != e->read(e, NULL, (unsigned char *)e->list_entry->preload, n, NULL))
 	    return 0;
-        /* if we've preloaded then we need to disable further escape
+        if (e->list_entry->identifier == csEUCPkdFmtJapanese ||
-         * sequences otherwise stray control sequences (eg 8E, 8F)
+                /* e->list_entry->identifier == csKSC56011987 || */
-         * will try and switch tables */
+                e->list_entry->identifier == csEUCKR ||
+                e->list_entry->identifier == csGB2312)
+        {
+            euc = 1;
+        }
+        /* If we've preloaded and we're not handling an EUC variant
+         * then we need to disable further escape sequences otherwise
+         * stray control sequences (eg 8E, 8F) will try and switch tables.
+         *
+         * If we're handling an EUC variant which has loaded tables into
+         * G2 and G3, then SS2/SS3 are permitted. */
+        if (euc && ((simple_set *)i->Set[G2])->table &&
+                ((simple_set *)i->Set[G3])->table)
+        {
            i->esc_disabled = 1;
+            i->c1_permitted = 1;
+        }
+        else
+        {
+            i->esc_disabled = 2;
+            if (euc)
+                i->c1_permitted = 0;
+        }
    }
    if (for_encoding != encoding_READ)
@@ -602,6 +641,12 @@ static unsigned int iso2022_read(EncodingPriv *e,
 	{
 	    u = 0xFFFD;
 	}
+        /* or illegal continuation bytes */
+        else if ((i->sync[_GL] && (c < 0x20 || c > 0x7F)) ||
+                 (i->sync[_GR] && (c < 0xA0)))
+        {
+            u = 0xFFFD;
+        }
 	else if (i->esc_pending)
 	{
            u = iso2022_esc_cont(i, c);
@@ -619,9 +664,17 @@ static unsigned int iso2022_read(EncodingPriv *e,
        else if (c < 0xA0)
        {
            i->sync[_GL] = i->sync[_GR] = 0;
+            if (i->c1_permitted == 2 ||
+                    (i->c1_permitted == 1 && (c == 0x8E || c == 0x8F)))
+            {
                u = i->CR->next_code(i->CR, c - 0x80, _CR, NULL);
            }
            else
+            {
+                u = 0xFFFD;
+            }
+        }
+        else
        {
            i->sync[_GL] = 0;
 	    u = i->GR->next_code(i->GR, c - 0xA0, _GR, i->sync + _GR);
@@ -635,9 +688,9 @@ static unsigned int iso2022_read(EncodingPriv *e,
                       break;
            case 0x0E: if (!i->esc_disabled) { iso2022_ls(i, G1); continue; }
                       break;
-            case 0x8E: if (!i->esc_disabled) { iso2022_ss(i, G2); continue; }
+            case 0x8E: if (i->esc_disabled < 2) { iso2022_ss(i, G2); continue; }
                       break;
-            case 0x8F: if (!i->esc_disabled) { iso2022_ss(i, G3); continue; }
+            case 0x8F: if (i->esc_disabled < 2) { iso2022_ss(i, G3); continue; }
                       break;
        }
@@ -874,6 +927,8 @@ static int iso2022_scan_sets(ISO2022_Encoding *enc, UCS4 u, int *index, int *tab
        /* UNIDBG(("scan_table: set %d table %p\n", set, setptr->table)); */
+        if (setptr->table == NULL) continue;
 	if ((i = encoding_lookup_in_table(u, setptr->table)) != -1)
 	{
 	    *index = i;
@@ -904,7 +959,9 @@ static int iso2022_write_euc(EncodingPriv *e, UCS4 u, unsigned char **euc, int *
 retry:
    /* control chars */
-    if (u < 0x0021)
+    if (u < 0x0021 || u == 0x007F)
+        buf[out++] = u;
+    else if ((enc->c1_permitted == 2 && 0x0080 <= u && u <= 0x009F))
        buf[out++] = u;
    /* main chars */

--- a/c/iso6937
+++ b/c/iso6937
@@ -351,11 +351,13 @@ static UCS4 iso6937_combine(Accent a, unsigned char letter)
 static int iso6937_find_accent_pair(UCS4 u)
 {
-    for (int a = 1; a <= 15; a++)
+    int a, i;
+    for (a = 1; a <= 15; a++)
    {
        if (iso6937_combination_table[a].combination)
        {
-            for (int i = 0; i < iso6937_combination_table[a].ncombinations; i++)
+            for (i = 0; i < iso6937_combination_table[a].ncombinations; i++)
            {
                if (iso6937_combination_table[a].combination[i].u == u)
                {

--- a/c/johab
+++ b/c/johab
@@ -113,10 +113,10 @@ static UCS4 hangul_to_ucs(unsigned int c1, unsigned int c2)
            /* Hangul is --X */
            static const unsigned char final_only[28] =
            {
-                   0, 0,    0, 0x33,    0, 0x35, 0x36,    0,
+                   0,    0,    0, 0x33,    0, 0x35, 0x36,
-                   0, 0, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
+                   0,    0, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E,
-                0x40, 0,    0,    0, 0x44,    0,    0,    0,
+                0x3F, 0x40,    0,    0, 0x44,    0,    0,
-                   0, 0,    0,    0
+                   0,    0,    0,    0,    0,    0,    0
            };
            u = 0x3100 + final_only[final];
@@ -247,8 +247,10 @@ static unsigned int johab_read(Encoding *e,
        }
        else
        {
-            if (c < 0x80)
+            if (c < 0x80 && c != 0x5C) /* Standard ASCII... */
                u = c;
+            else if (c == 0x5C) /* ...except 0x5C, which maps to Won */
+                u = 0x20A9;
            else if ((c >= 0x84 && c <= 0xD3) ||
                     (c >= 0xD8 && c <= 0xDE) ||
                     (c >= 0xE0 && c <= 0xF9))
@@ -330,13 +332,15 @@ static int ucs_jamo_to_johab(UCS4 u)
 static int johab_write(EncodingPriv *e, UCS4 u, unsigned char **johab, int *bufsize)
 {
    Johab_Encoding *je = (Johab_Encoding *) e;
-    int c = '?';
+    int c = 0xFFFD;
    if (u == NULL_UCS4)
 	return 0;
-    if (u <= 0x7F) /* Basic Latin */
+    if (u <= 0x7F && u != 0x5C) /* Basic Latin */
        c = u;
+    else if (u == 0x20A9) /* Won Sign, mapped to 0x5C */
+        c = 0x5C;
    else if (u >= 0xAC00 && u <= 0xD7A3) /* Hangul syllables */
        c = ucs_hangul_to_johab(u);
    else if (u >= 0x3131 && u <= 0x3163) /* Modern Jamo */
@@ -370,6 +374,11 @@ static int johab_write(EncodingPriv *e, UCS4 u, unsigned char **johab, int *bufs
        }
    }
+    if (c == 0xFFFD && e->for_encoding == encoding_WRITE_STRICT)
+        return -1;
+    else if (c == 0xFFFD)
+        c = '?';
    if ((*bufsize -= (c > 0xFF ? 2 : 1)) < 0 || !johab)
 	return 0;

--- a/c/shiftjis
+++ b/c/shiftjis
@@ -170,7 +170,7 @@ static unsigned int shiftjis_read(Encoding *e,
        else
        {
            if (c < 0x80)
-                u = c == 0x5C ? 0x00A5 : c; /* CP932 is as Basic Latin, except for yen */
+                u = c == 0x5C ? 0x00A5 : (c == 0x7E ? 0x203E : c); /* CP932 is as Basic Latin, except for yen and overbar */
            else if (c == 0x80)
                u = 0x005C; /* Backslash - a Mac extension */
            else if (c < 0xA0)
@@ -214,7 +214,7 @@ static int lookup_table(UCS4 u, ShiftJIS_Encoding *sj, int *index, int *table_no
 {
    int i;
-    if (u >= 0x21 && u <= 0x7E && u != 0x5C)  /* lower set is ASCII, except... */
+    if (u >= 0x21 && u < 0x7E && u != 0x5C)  /* lower set is ASCII, except... */
    {
        *table_no = 0;
        *index = u - 0x21;
@@ -228,6 +228,13 @@ static int lookup_table(UCS4 u, ShiftJIS_Encoding *sj, int *index, int *table_no
        return 1;
    }
+    if (u == 0x203E) /* slot 7E is overbar */
+    {
+        *table_no = 0;
+        *index = 0x7E - 0x21;
+        return 1;
+    }
    if ((i = encoding_lookup_in_table(u, sj->katakana)) != -1)
    {
 	*table_no = 1;

--- a/c/textconv
+++ b/c/textconv
@@ -64,8 +64,8 @@ static int usage(void)
 static int src_enc = csCurrent;
 static int dst_enc = csCurrent;
-static FILE *in = stdin;
+static FILE *in;
-static FILE *out = stdout;
+static FILE *out;
 static Encoding *read, *write;
 static char inbuf[256], outbuf[256];
 static unsigned int src_flags, dst_flags;
@@ -181,6 +181,15 @@ int main(int argc, char **argv)
                return 1;
            }
        }
+        else
+        {
+            out = stdout;
+        }
+    }
+    else
+    {
+        in = stdin;
+        out = stdout;
    }
    if (src_enc == dst_enc)

--- a/c/unix
+++ b/c/unix
@@ -30,6 +30,8 @@
 #include "layers_dbg.h"
 #endif
+#include <dirent.h>
 #include <string.h>
 #include <stdio.h>
@@ -37,9 +39,12 @@
 int encoding__load_map_file(const char *leaf, UCS2 **ptable, int *pn_entries, int *palloc)
 {
+    DIR *dir;
    FILE *fh;
    int flen;
    char fname[1024];
+    char *slash;
+    struct dirent *dp;
    void *table;
    int n_entries;
@@ -56,6 +61,27 @@ int encoding__load_map_file(const char *leaf, UCS2 **ptable, int *pn_entries, in
    strncat(fname, leaf, sizeof(fname));
    fname[sizeof(fname)-1] = 0;
+    /* We get to search the directory, because the leafname may be a prefix */
+    slash = strrchr(fname, '/');
+    if (!slash) return 0;
+    *slash = '\0';
+    slash++;
+    dir = opendir(fname);
+    if (!dir) return 0;
+    while ((dp = readdir(dir)) != NULL) {
+        if (strncmp(dp->d_name, slash, strlen(slash)) == 0) {
+            /* Restore the separator and replace the prefix with the
+             * full name of the matching directory entry */
+            *(slash - 1) = '/';
+            *slash = '\0';
+            /* strncat's bound is the space remaining, not the buffer size */
+            strncat(fname, dp->d_name, sizeof(fname) - strlen(fname) - 1);
+            break;
+        }
+    }
+    /* No entry matched: fname now holds only the directory path, so fail
+     * here rather than handing a directory name to fopen() */
+    if (!dp) {
+        closedir(dir);
+        return 0;
+    }
+    closedir(dir);
    fh = fopen(fname, "rb");
    if (!fh)
 	return 0;

--- a/ccsolaris/Makefile
+++ b/ccsolaris/Makefile
@@ -14,9 +14,25 @@
 #
 # Project:   Unicode
-CC=gcc
+ifeq ($(findstring riscos,$(TARGET)),riscos)
+	GCCSDK_INSTALL_CROSSBIN ?= /home/riscos/cross/bin
-CCflags=-funsigned-char
+	CC = $(wildcard $(GCCSDK_INSTALL_CROSSBIN)/*gcc)
+	ifeq ($(findstring module,$(TARGET)),module)
+	PlatCCflags = -mmodule
+	endif
+	PlatObjs = riscos.o
+else
+	CC = gcc
+	PlatObjs = unix.o
+endif
+HOST_CC = gcc
+CCflags = -funsigned-char -g -O0 $(PlatCCflags)
 .c.o:;	$(CC) -c -DDEBUG=0 $(CCflags) -o $@ $<
@@ -40,7 +56,8 @@ Objects = autojp.o \
 	enc_system.o \
 	acorn.o \
 	combine.o \
-	unix.o
+	debug.o \
+	$(PlatObjs)
 all:	ucodelib.a textconv
@@ -48,7 +65,10 @@ ucodelib.a: $(Objects)
 	${AR} r $@ $(Objects)
 textconv: textconv.o ucodelib.a
-	${CC} -o $@ textconv.o ucodelib.a
+	${CC} $(CCflags) -o $@ textconv.o ucodelib.a
+mkunictype: mkunictype.c
+	${HOST_CC} -o $@ $<
 clean:
 	@-rm mkunictype textconv