Commit 69a25816 authored by Andrew Hodgkinson
Browse files

Fix bugs and inconsistencies in encoding handlers.

  Fix inconsistency in handling illegal byte sequences.
  Convert surrogate codepoints and U+FFFE, U+FFFF to U+FFFD.
  Also, a few extra mappings.
Detail:
  enc_utf8.c: 0x80 is a continuation byte. Map stray ones to U+FFFD.
              Reset the count of expected continuation bytes to 0 when
              encountering illegal byte sequences. Previously, if the character
              callback returned non-zero, this count would not be reset, thus
              leaving the codec in an inconsistent state. Additionally, we no
              longer consume the illegal continuation byte: instead, we process
              it as a start byte next time round.
  encoding.c: Do not load extension tables for ISO-8859-{1,2,9,10,15,16}.
              If these are needed, it's probably best that different charset
              names are used rather than overloading 8859-n.
  iso2022.c:  Permit SS2/3 escape sequences for EUC encode/decode.
              Disable C1 characters for EUC encode/decode.
              Fix G94x94 read function to handle GR 0xA0/0xFF correctly.
              Fix writing of C1 controls for 8859-n.
              Prevent dereference of NULL pointer when scanning tables.
  iso6937.c:  Replace C99 loop iterators with C89 friendly versions.
  johab.c:    Fix final_only lookup table to have entries in the right place.
              Map 0x5C to the Won sign.
              Actually pay attention to encoding_WRITE_STRICT.
  shiftjis.c: Map 0x7E to overbar rather than tilde.
  textconv.c: Fix static assignment of stdin/stdout.
  unix.c:     Perform wildcard lookup of mapping tables.
  ccsolaris/Makefile: Modify for use with GCCSDK
Admin:
  Tested with the Iconv module testsuite.
Author:
  John-Mark Bell


Version 0.56. Tagged as 'Unicode-0_56'
parent a3d2481a
/* (0.55)
/* (0.56)
*
* This file is automatically maintained by srccommit, do not edit manually.
* Last processed by srccommit version: 1.2.
* Last processed by srccommit version: 1.1.
*
*/
#define Module_MajorVersion_CMHG 0.55
#define Module_MajorVersion_CMHG 0.56
#define Module_MinorVersion_CMHG
#define Module_Date_CMHG 26 Aug 2005
#define Module_Date_CMHG 05 Dec 2008
#define Module_MajorVersion "0.55"
#define Module_Version 55
#define Module_MajorVersion "0.56"
#define Module_Version 56
#define Module_MinorVersion ""
#define Module_Date "26 Aug 2005"
#define Module_Date "05 Dec 2008"
#define Module_ApplicationDate "26-Aug-05"
#define Module_ApplicationDate "05-Dec-08"
#define Module_ComponentName "Unicode"
#define Module_ComponentPath "RiscOS/Sources/Lib/Unicode"
#define Module_ComponentPath "castle/RiscOS/Sources/Lib/Unicode"
#define Module_FullVersion "0.55"
#define Module_HelpVersion "0.55 (26 Aug 2005)"
#define Module_LibraryVersionInfo "0:55"
#define Module_FullVersion "0.56"
#define Module_HelpVersion "0.56 (05 Dec 2008)"
#define Module_LibraryVersionInfo "0:56"
......@@ -78,22 +78,22 @@ static unsigned int utf8_read(Encoding *e,
}
else
{
/* Reset the count of expected continuation bytes */
ue->count = 0;
if (ucs_out)
if (ucs_out(handle, 0xFFFD))
{
/* Character has been used, so ensure it's counted */
count--;
/* Do not consume the invalid continuation byte */
break;
}
ue->count = 0;
goto retry;
}
}
else
{
if (c <= 0x80)
if (c < 0x80)
u = c;
else if (c < 0xC0 || c >= 0xFE)
u = 0xFFFD;
......@@ -115,10 +115,14 @@ static unsigned int utf8_read(Encoding *e,
ue->first = 0;
/* Reject surrogates and FFFE/FFFF */
if ((0xD800 <= u && u <= 0xE000) || u == 0xFFFE || u == 0xFFFF)
u = 0xFFFD;
if (ucs_out)
if (ucs_out(handle, u))
{
/* Character has been used, so ensure its counted */
/* Character has been used, so ensure it's counted */
count--;
break;
}
......
......@@ -64,20 +64,20 @@ static char version[] = "Unicode library " Module_MajorVersion " " Module_Date "
static EncList enclist[] =
{
{ csASCII /* 3 */, 1, "/US-ASCII/", lang_ENGLISH, &enc_ascii, NULL, NULL },
{ csISOLatin1 /* 4 */, 1, "/ISO-8859-1/ISO-IR-100/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x41\x1B\x2E\x42\x1B\x2F\x50", NULL }, /* Select G1 Latin-1, G2 Latin-2, G3 supplement */
{ csISOLatin2 /* 5 */, 1, "/ISO-8859-2/ISO-IR-101/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x42\x1B\x2E\x41\x1B\x2F\x50", NULL }, /* Select G1 Latin-2, G2 Latin-1, G3 supplement */
{ csISOLatin1 /* 4 */, 1, "/ISO-8859-1/ISO-IR-100/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x41"/*\x1B\x2E\x42\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-1, G2 Latin-2, G3 supplement */
{ csISOLatin2 /* 5 */, 1, "/ISO-8859-2/ISO-IR-101/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x42"/*\x1B\x2E\x41\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-2, G2 Latin-1, G3 supplement */
{ csISOLatin3 /* 6 */, 1, "/ISO-8859-3/ISO-IR-109/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x43", NULL }, /* Select Latin-3 right half */
{ csISOLatin4 /* 7 */, 1, "/ISO-8859-4/ISO-IR-110/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x44", NULL }, /* Select Latin-4 right half */
{ csISOLatinCyrillic /* 8 */, 1, "/ISO-8859-5/ISO-IR-144/", lang_RUSSIAN, &enc_iso8859, "\x1B\x2D\x4C", NULL }, /* Select Cyrillic right half */
{ csISOLatinGreek /* 10 */, 1, "/ISO-8859-7/ISO-IR-126/", lang_GREEK, &enc_iso8859, "\x1B\x2D\x46", NULL }, /* Select Greek right half */
{ csISOLatinHebrew /* 11 */, 1, "/ISO-8859-8/ISO-IR-198/", lang_HEBREW, &enc_iso8859, "\x1B\x2D\x5E", NULL }, /* Select Hebrew right half */
{ csISOLatin5 /* 12 */, 1, "/ISO-8859-9/ISO-IR-148/", lang_TURKISH, &enc_iso8859, "\x1B\x2D\x4D\x1B\x2E\x42\x1B\x2F\x50", NULL }, /* Select G1 Latin-5, G2 Latin-2, G3 supplement */
{ csISOLatin6 /* 13 */, 1, "/ISO-8859-10/ISO-IR-157/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x56\x1B\x2E\x58", NULL }, /* Select Latin-6 right half, and Sami supplement as G2 */
{ csISOLatin5 /* 12 */, 1, "/ISO-8859-9/ISO-IR-148/", lang_TURKISH, &enc_iso8859, "\x1B\x2D\x4D"/*\x1B\x2E\x42\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-5, G2 Latin-2, G3 supplement */
{ csISOLatin6 /* 13 */, 1, "/ISO-8859-10/ISO-IR-157/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x56"/*\x1B\x2E\x58"*/, NULL }, /* Select Latin-6 right half, and Sami supplement as G2 */
{ csISOLatinThai, 1, "/ISO-8859-11/ISO-IR-166/", lang_THAI, &enc_iso8859, "\x1B\x2D\x54", NULL }, /* Select Thai right half */
{ csISOLatin7, 1, "/ISO-8859-13/ISO-IR-179/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x59", NULL }, /* Select Baltic Rim right half */
{ csISOLatin8, 1, "/ISO-8859-14/ISO-IR-199/", lang_IRISH, &enc_iso8859, "\x1B\x2D\x5F", NULL }, /* Select Celtic right half */
{ csISOLatin9, 1, "/ISO-8859-15/ISO-IR-203/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x62\x1B\x2E\x42\x1B\x2F\x50", NULL }, /* Select G1 Latin-9, G2 Latin-2, G3 supplement */
{ csISOLatin10, 1, "/ISO-8859-16/ISO-IR-226/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x66\x1B\x2E\x41\x1B\x2F\x50", NULL }, /* Select G1 Latin-10, G2 Latin-1, G3 supplement */
{ csISOLatin9, 1, "/ISO-8859-15/ISO-IR-203/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x62"/*\x1B\x2E\x42\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-9, G2 Latin-2, G3 supplement */
{ csISOLatin10, 1, "/ISO-8859-16/ISO-IR-226/", lang_ENGLISH, &enc_iso8859, "\x1B\x2D\x66"/*\x1B\x2E\x41\x1B\x2F\x50"*/, NULL }, /* Select G1 Latin-10, G2 Latin-1, G3 supplement */
{ csISO6937, 2, "/ISO-IR-156/", lang_ENGLISH, &enc_iso6937, "\x1B\x2D\x52", NULL }, /* Select ISO6937 right half */
{ csISO6937DVB, 2, "/X-ISO-6937-DVB/X-DVB/", lang_ENGLISH, &enc_iso6937, NULL, NULL },
{ csShiftJIS /* 17 */, 2, "/SHIFT_JIS/X-SJIS/", lang_JAPANESE, &enc_shiftjis, NULL, NULL },
......
......@@ -29,6 +29,7 @@
#include <stdio.h>
#include <string.h>
#include "charsets.h"
#include "encpriv.h"
#include "iso2022.h"
......@@ -63,8 +64,24 @@ struct ISO2022_Encoding
unsigned char tempset;
ISO2022_Set *oldset;
/* Whether escape sequences are disabled
*
* Value: Meaning:
* 0 All escape sequences enabled
* 1 Only SS2/3 escape sequences enabled
* 2 All escape sequences disabled
*/
unsigned char esc_disabled;
/* Whether C1 control characters are permitted
*
* Value: Meaning:
* 0 No C1 control characters permitted
* 1 Only 0x8E/0x8F permitted
* 2 All C1 control characters permitted
*/
unsigned char c1_permitted;
/* Pending escape commands */
unsigned char esc_pending;
unsigned char esc_multi;
......@@ -133,7 +150,7 @@ static UCS4 simple_double_next_code_94(ISO2022_Set *s, int c, int invoker, unsig
if (c == 0x00 || c == 0x5F)
{
*sync = 0;
return c + 0x20;
return invoker == _GL ? c + 0x20 : 0xFFFD;
}
if (!*sync)
......@@ -147,8 +164,6 @@ static UCS4 simple_double_next_code_94(ISO2022_Set *s, int c, int invoker, unsig
*sync = 0;
return u;
}
NOT_USED(invoker);
}
static UCS4 null_double_next_code(ISO2022_Set *s, int c, int invoker, unsigned char *sync)
......@@ -376,6 +391,7 @@ static int iso2022_reset(Encoding *e, int for_encoding)
i->CR_s = C1;
i->GR_s = G1;
i->c1_permitted = 2;
i->esc_disabled = 0;
i->esc_pending = i->esc_revision = 0;
i->tempset = 0;
......@@ -391,18 +407,41 @@ static int iso2022_reset(Encoding *e, int for_encoding)
iso2022_select_set(i, C0, 32, C0_ISO646);
iso2022_select_set(i, C1, 32+1, C1_ISO6429);
/* ISO8859 and EUC variants of IOS2022 require preloading with
/* ISO8859 and EUC variants of ISO2022 require preloading with
escape sequences to get the appropriate tables */
if (e->list_entry->preload)
{
char euc = 0;
unsigned int n = strlen(e->list_entry->preload);
if (n != e->read(e, NULL, (unsigned char *)e->list_entry->preload, n, NULL))
return 0;
/* if we've preloaded then we need to disable further escape
* sequences otherwise stray control sequences (eg 8E, 8F)
* will try and switch tables */
if (e->list_entry->identifier == csEUCPkdFmtJapanese ||
/* e->list_entry->identifier == csKSC56011987 || */
e->list_entry->identifier == csEUCKR ||
e->list_entry->identifier == csGB2312)
{
euc = 1;
}
/* If we've preloaded and we're not handling an EUC variant
* then we need to disable further escape sequences otherwise
* stray control sequences (eg 8E, 8F) will try and switch tables.
*
* If we're handling an EUC variant which has loaded tables into
* G2 and G3, then SS2/SS3 are permitted. */
if (euc && ((simple_set *)i->Set[G2])->table &&
((simple_set *)i->Set[G3])->table)
{
i->esc_disabled = 1;
i->c1_permitted = 1;
}
else
{
i->esc_disabled = 2;
if (euc)
i->c1_permitted = 0;
}
}
if (for_encoding != encoding_READ)
......@@ -602,6 +641,12 @@ static unsigned int iso2022_read(EncodingPriv *e,
{
u = 0xFFFD;
}
/* or illegal continuation bytes */
else if ((i->sync[_GL] && (c < 0x20 || c > 0x7F)) ||
(i->sync[_GR] && (c < 0xA0)))
{
u = 0xFFFD;
}
else if (i->esc_pending)
{
u = iso2022_esc_cont(i, c);
......@@ -619,9 +664,17 @@ static unsigned int iso2022_read(EncodingPriv *e,
else if (c < 0xA0)
{
i->sync[_GL] = i->sync[_GR] = 0;
if (i->c1_permitted == 2 ||
(i->c1_permitted == 1 && (c == 0x8E || c == 0x8F)))
{
u = i->CR->next_code(i->CR, c - 0x80, _CR, NULL);
}
else
{
u = 0xFFFD;
}
}
else
{
i->sync[_GL] = 0;
u = i->GR->next_code(i->GR, c - 0xA0, _GR, i->sync + _GR);
......@@ -635,9 +688,9 @@ static unsigned int iso2022_read(EncodingPriv *e,
break;
case 0x0E: if (!i->esc_disabled) { iso2022_ls(i, G1); continue; }
break;
case 0x8E: if (!i->esc_disabled) { iso2022_ss(i, G2); continue; }
case 0x8E: if (i->esc_disabled < 2) { iso2022_ss(i, G2); continue; }
break;
case 0x8F: if (!i->esc_disabled) { iso2022_ss(i, G3); continue; }
case 0x8F: if (i->esc_disabled < 2) { iso2022_ss(i, G3); continue; }
break;
}
......@@ -874,6 +927,8 @@ static int iso2022_scan_sets(ISO2022_Encoding *enc, UCS4 u, int *index, int *tab
/* UNIDBG(("scan_table: set %d table %p\n", set, setptr->table)); */
if (setptr->table == NULL) continue;
if ((i = encoding_lookup_in_table(u, setptr->table)) != -1)
{
*index = i;
......@@ -904,7 +959,9 @@ static int iso2022_write_euc(EncodingPriv *e, UCS4 u, unsigned char **euc, int *
retry:
/* control chars */
if (u < 0x0021)
if (u < 0x0021 || u == 0x007F)
buf[out++] = u;
else if ((enc->c1_permitted == 2 && 0x0080 <= u && u <= 0x009F))
buf[out++] = u;
/* main chars */
......
......@@ -351,11 +351,13 @@ static UCS4 iso6937_combine(Accent a, unsigned char letter)
static int iso6937_find_accent_pair(UCS4 u)
{
for (int a = 1; a <= 15; a++)
int a, i;
for (a = 1; a <= 15; a++)
{
if (iso6937_combination_table[a].combination)
{
for (int i = 0; i < iso6937_combination_table[a].ncombinations; i++)
for (i = 0; i < iso6937_combination_table[a].ncombinations; i++)
{
if (iso6937_combination_table[a].combination[i].u == u)
{
......
......@@ -113,10 +113,10 @@ static UCS4 hangul_to_ucs(unsigned int c1, unsigned int c2)
/* Hangul is --X */
static const unsigned char final_only[28] =
{
0, 0, 0, 0x33, 0, 0x35, 0x36, 0,
0, 0, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
0x40, 0, 0, 0, 0x44, 0, 0, 0,
0, 0, 0, 0
0, 0, 0, 0x33, 0, 0x35, 0x36,
0, 0, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E,
0x3F, 0x40, 0, 0, 0x44, 0, 0,
0, 0, 0, 0, 0, 0, 0
};
u = 0x3100 + final_only[final];
......@@ -247,8 +247,10 @@ static unsigned int johab_read(Encoding *e,
}
else
{
if (c < 0x80)
if (c < 0x80 && c != 0x5C) /* Standard ASCII... */
u = c;
else if (c == 0x5C) /* ...except 0x5C, which maps to Won */
u = 0x20A9;
else if ((c >= 0x84 && c <= 0xD3) ||
(c >= 0xD8 && c <= 0xDE) ||
(c >= 0xE0 && c <= 0xF9))
......@@ -330,13 +332,15 @@ static int ucs_jamo_to_johab(UCS4 u)
static int johab_write(EncodingPriv *e, UCS4 u, unsigned char **johab, int *bufsize)
{
Johab_Encoding *je = (Johab_Encoding *) e;
int c = '?';
int c = 0xFFFD;
if (u == NULL_UCS4)
return 0;
if (u <= 0x7F) /* Basic Latin */
if (u <= 0x7F && u != 0x5C) /* Basic Latin */
c = u;
else if (u == 0x20A9) /* Won Sign, mapped to 0x5C */
c = 0x5C;
else if (u >= 0xAC00 && u <= 0xD7A3) /* Hangul syllables */
c = ucs_hangul_to_johab(u);
else if (u >= 0x3131 && u <= 0x3163) /* Modern Jamo */
......@@ -370,6 +374,11 @@ static int johab_write(EncodingPriv *e, UCS4 u, unsigned char **johab, int *bufs
}
}
if (c == 0xFFFD && e->for_encoding == encoding_WRITE_STRICT)
return -1;
else if (c == 0xFFFD)
c = '?';
if ((*bufsize -= (c > 0xFF ? 2 : 1)) < 0 || !johab)
return 0;
......
......@@ -170,7 +170,7 @@ static unsigned int shiftjis_read(Encoding *e,
else
{
if (c < 0x80)
u = c == 0x5C ? 0x00A5 : c; /* CP932 is as Basic Latin, except for yen */
u = c == 0x5C ? 0x00A5 : (c == 0x7E ? 0x203E : c); /* CP932 is as Basic Latin, except for yen and overbar */
else if (c == 0x80)
u = 0x005C; /* Backslash - a Mac extension */
else if (c < 0xA0)
......@@ -214,7 +214,7 @@ static int lookup_table(UCS4 u, ShiftJIS_Encoding *sj, int *index, int *table_no
{
int i;
if (u >= 0x21 && u <= 0x7E && u != 0x5C) /* lower set is ASCII, except... */
if (u >= 0x21 && u < 0x7E && u != 0x5C) /* lower set is ASCII, except... */
{
*table_no = 0;
*index = u - 0x21;
......@@ -228,6 +228,13 @@ static int lookup_table(UCS4 u, ShiftJIS_Encoding *sj, int *index, int *table_no
return 1;
}
if (u == 0x203E) /* slot 7E is overbar */
{
*table_no = 0;
*index = 0x7E - 0x21;
return 1;
}
if ((i = encoding_lookup_in_table(u, sj->katakana)) != -1)
{
*table_no = 1;
......
......@@ -64,8 +64,8 @@ static int usage(void)
static int src_enc = csCurrent;
static int dst_enc = csCurrent;
static FILE *in = stdin;
static FILE *out = stdout;
static FILE *in;
static FILE *out;
static Encoding *read, *write;
static char inbuf[256], outbuf[256];
static unsigned int src_flags, dst_flags;
......@@ -181,6 +181,15 @@ int main(int argc, char **argv)
return 1;
}
}
else
{
out = stdout;
}
}
else
{
in = stdin;
out = stdout;
}
if (src_enc == dst_enc)
......
......@@ -30,6 +30,8 @@
#include "layers_dbg.h"
#endif
#include <dirent.h>
#include <string.h>
#include <stdio.h>
......@@ -37,9 +39,12 @@
int encoding__load_map_file(const char *leaf, UCS2 **ptable, int *pn_entries, int *palloc)
{
DIR *dir;
FILE *fh;
int flen;
char fname[1024];
char *slash;
struct dirent *dp;
void *table;
int n_entries;
......@@ -56,6 +61,27 @@ int encoding__load_map_file(const char *leaf, UCS2 **ptable, int *pn_entries, in
strncat(fname, leaf, sizeof(fname));
fname[sizeof(fname)-1] = 0;
/* We get to search the directory, because the leafname may be a prefix */
slash = strrchr(fname, '/');
if (!slash) return 0;
*slash = '\0';
slash++;
dir = opendir(fname);
if (!dir) return 0;
while ((dp = readdir(dir)) != NULL) {
if (strncmp(dp->d_name, slash, strlen(slash)) == 0) {
*(slash - 1) = '/';
*slash = '\0';
strncat(fname, dp->d_name, sizeof(fname));
break;
}
}
closedir(dir);
fh = fopen(fname, "rb");
if (!fh)
return 0;
......
......@@ -14,9 +14,25 @@
#
# Project: Unicode
CC=gcc
ifeq ($(findstring riscos,$(TARGET)),riscos)
GCCSDK_INSTALL_CROSSBIN ?= /home/riscos/cross/bin
CCflags=-funsigned-char
CC = $(wildcard $(GCCSDK_INSTALL_CROSSBIN)/*gcc)
ifeq ($(findstring module,$(TARGET)),module)
PlatCCflags = -mmodule
endif
PlatObjs = riscos.o
else
CC = gcc
PlatObjs = unix.o
endif
HOST_CC = gcc
CCflags = -funsigned-char -g -O0 $(PlatCCflags)
.c.o:; $(CC) -c -DDEBUG=0 $(CCflags) -o $@ $<
......@@ -40,7 +56,8 @@ Objects = autojp.o \
enc_system.o \
acorn.o \
combine.o \
unix.o
debug.o \
$(PlatObjs)
all: ucodelib.a textconv
......@@ -48,7 +65,10 @@ ucodelib.a: $(Objects)
${AR} r $@ $(Objects)
textconv: textconv.o ucodelib.a
${CC} -o $@ textconv.o ucodelib.a
${CC} $(CCflags) -o $@ textconv.o ucodelib.a
mkunictype: mkunictype.c
${HOST_CC} -o $@ $<
clean:
@-rm mkunictype textconv
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment