Commit 407bccff authored by Simon Middleton's avatar Simon Middleton
Browse files

Fixed autojp state machine. It wasn't resetting 'state' to HAD_NONE after...

Fixed autojp state machine. It wasn't resetting 'state' to HAD_NONE after changing whatcode. So basically it was lucky it ever worked. Also rewrote the various range tests to only use one compare per case.

Changed the 'for_encoding' parameter to encoding_write() to an enumeration.
Added a new type of writing where if the character cannot be encoded then
the function returns -1 rather than writing a default character
Added the pseudo-charsets csAutodetectJP and csEUCorShiftJIS to the encoding
table so that they return the correct default language (ja).
Added function to remove unused encoding tables (must be called explicitly).
Fixed usage counting in iso2022 (I think).
When looking up encoding name try stripping 'x-' and 'X-' off the front if it
can't be found on first pass.

Version 0.12. Tagged as 'Unicode-0_12'
parent 10298658
/* (0.10) /* (0.12)
* *
* This file is automatically maintained by srccommit, do not edit manually. * This file is automatically maintained by srccommit, do not edit manually.
* *
*/ */
#define Module_MajorVersion_CMHG 0.10 #define Module_MajorVersion_CMHG 0.12
#define Module_MinorVersion_CMHG #define Module_MinorVersion_CMHG
#define Module_Date_CMHG 18 Dec 1997 #define Module_Date_CMHG 05 Jan 1998
#define Module_MajorVersion "0.10" #define Module_MajorVersion "0.12"
#define Module_MinorVersion "" #define Module_MinorVersion ""
#define Module_Date "18 Dec 1997" #define Module_Date "05 Jan 1998"
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
#include "charsets.h" #include "charsets.h"
#include "autojp.h" #include "autojp.h"
#include <stdio.h> /* for debugging */
#define NEW csISO2022JP /* Normal JIS */ #define NEW csISO2022JP /* Normal JIS */
#define OLD csISO2022JP /* Obsolete JIS variant */ #define OLD csISO2022JP /* Obsolete JIS variant */
/* #define NEC 3 Unknown variant */ /* #define NEC 3 Unknown variant */
...@@ -45,6 +47,22 @@ ...@@ -45,6 +47,22 @@
#define ESC 27 #define ESC 27
#define SS2 142 #define SS2 142
//#define DBG DEBUG
#define DBG 0
#if DBG
/*
 * Printable names for the detector states, used only by the DBG trace
 * output (indexed by the current 'state' value).
 * NOTE(review): the order here is assumed to match the numeric values of
 * the HAD_* state constants - confirm against their definitions.
 *
 * Both the strings and the table itself are read-only, so declare them
 * const: string literals must never be written through.
 */
static const char *const states[] =
{
    "HAD_NONE",
    "HAD_ESC",
    "HAD_ESC_DOLLAR",
    "HAD_SS2",
    "HAD_161_223",
    "HAD_224_239_A",
    "HAD_224_239_B"
};
#endif
/* /*
#define SJIS1(A) ((A >= 129 && A <= 159) || (A >= 224 && A <= 239)) #define SJIS1(A) ((A >= 129 && A <= 159) || (A >= 224 && A <= 239))
#define SJIS2(A) (A >= 64 && A <= 252) #define SJIS2(A) (A >= 64 && A <= 252)
...@@ -73,6 +91,10 @@ int autojp_consume(int *pencoding, int *pstate, int c) ...@@ -73,6 +91,10 @@ int autojp_consume(int *pencoding, int *pstate, int c)
if (whatcode != ASCII && whatcode != EUCORSJIS) if (whatcode != ASCII && whatcode != EUCORSJIS)
return autojp_DECIDED; return autojp_DECIDED;
#if DBG
fprintf(stderr, "autojp_consume: %02x (%c) ", c, c < 32 ? '.' : c);
#endif
switch (state) switch (state)
{ {
case HAD_ESC: case HAD_ESC:
...@@ -91,6 +113,7 @@ int autojp_consume(int *pencoding, int *pstate, int c) ...@@ -91,6 +113,7 @@ int autojp_consume(int *pencoding, int *pstate, int c)
break; break;
case HAD_ESC_DOLLAR: case HAD_ESC_DOLLAR:
state = HAD_NONE;
switch (c) switch (c)
{ {
case 'B': case 'B':
...@@ -99,54 +122,57 @@ int autojp_consume(int *pencoding, int *pstate, int c) ...@@ -99,54 +122,57 @@ int autojp_consume(int *pencoding, int *pstate, int c)
case '@': case '@':
whatcode = OLD; whatcode = OLD;
break; break;
default:
state = HAD_NONE;
break;
} }
break; break;
case HAD_SS2: case HAD_SS2:
if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160) || (c >= 224 && c <= 252)) state = HAD_NONE;
if (c < 64 || c == 127)
;
else if (c <= 160)
whatcode = SJIS; whatcode = SJIS;
else if (c >= 161 && c <= 223) else if (c <= 223)
whatcode = EUCORSJIS; whatcode = EUCORSJIS;
else else if (c <= 252)
state = HAD_NONE; whatcode = SJIS;
break; break;
case HAD_161_223: case HAD_161_223:
if (c >= 240 && c <= 254) state = HAD_NONE;
whatcode = EUC; if (c <= 159)
else if (c >= 161 && c <= 223) whatcode = SJIS;
else if (c == 160)
;
else if (c <= 223)
whatcode = EUCORSJIS; whatcode = EUCORSJIS;
else if (c >= 224 && c <= 239) else if (c <= 239)
state = HAD_224_239_B; state = HAD_224_239_B;
else if (c <= 159) else if (c <= 254)
whatcode = SJIS; whatcode = EUC;
else
state = HAD_NONE;
break; break;
case HAD_224_239_A: case HAD_224_239_A:
if (c < 129) state = HAD_NONE;
if (c < 129 || c == 142)
whatcode = EUCORSJIS; whatcode = EUCORSJIS;
else if (c <= 141 || (c >= 143 && c <= 159)) else if (c <= 159)
whatcode = SJIS; whatcode = SJIS;
else if (c >= 253 && c <= 254) else if (c == 253 || c == 254)
whatcode = EUC; whatcode = EUC;
else else
whatcode = EUCORSJIS; whatcode = EUCORSJIS;
break; break;
case HAD_224_239_B: case HAD_224_239_B:
if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160)) state = HAD_NONE;
if (c < 64 || c == 127)
;
else if (c <= 160)
whatcode = SJIS; whatcode = SJIS;
else if (c >= 253 && c <= 254) else if (c <= 252)
whatcode = EUC;
else if (c >= 161 && c <= 252)
whatcode = EUCORSJIS; whatcode = EUCORSJIS;
else else if (c <= 254)
state = HAD_NONE; whatcode = EUC;
break; break;
case HAD_NONE: case HAD_NONE:
...@@ -154,19 +180,27 @@ int autojp_consume(int *pencoding, int *pstate, int c) ...@@ -154,19 +180,27 @@ int autojp_consume(int *pencoding, int *pstate, int c)
state = HAD_ESC; state = HAD_ESC;
else if (c <= 128) else if (c <= 128)
; /* no new decision */ ; /* no new decision */
else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159)) else if (c <= 141)
whatcode = SJIS; whatcode = SJIS;
else if (c == SS2) else if (c == SS2)
state = HAD_SS2; state = HAD_SS2;
else if (c >= 161 && c <= 223) else if (c <= 159)
whatcode = SJIS;
else if (c == 160)
;
else if (c <= 223)
state = HAD_161_223; state = HAD_161_223;
else if (c >= 240 && c <= 254) else if (c <= 239)
whatcode = EUC;
else if (c >= 224 && c <= 239)
state = HAD_224_239_A; state = HAD_224_239_A;
else if (c <= 254)
whatcode = EUC;
break; break;
} }
#if DBG
fprintf(stderr, "enc %d '%s'\n", whatcode, states[state]);
#endif
*pencoding = whatcode; *pencoding = whatcode;
*pstate = state; *pstate = state;
......
...@@ -152,6 +152,8 @@ static int bigfive_write(EncodingPriv *e, UCS4 u, char **bf, int *bufsize) ...@@ -152,6 +152,8 @@ static int bigfive_write(EncodingPriv *e, UCS4 u, char **bf, int *bufsize)
c = 0x80; c = 0x80;
else if (u == 0x2026) /* Ellipsis - a Mac extension */ else if (u == 0x2026) /* Ellipsis - a Mac extension */
c = 0xff; c = 0xff;
else if (e->for_encoding == encoding_WRITE_STRICT)
return -1;
else /* if (u == 0xFFFD) */ /* bad character */ else /* if (u == 0xFFFD) */ /* bad character */
c = '?'; c = '?';
......
...@@ -73,6 +73,8 @@ int eightbit_write(EncodingPriv *e, UCS4 u, char **s, int *bufsize) ...@@ -73,6 +73,8 @@ int eightbit_write(EncodingPriv *e, UCS4 u, char **s, int *bufsize)
c = u; c = u;
else if ((i = encoding_lookup_in_table(u, ee->table)) != -1) else if ((i = encoding_lookup_in_table(u, ee->table)) != -1)
c = i + 0x80; c = i + 0x80;
else if (e->for_encoding == encoding_WRITE_STRICT)
return -1;
else else
c = '?'; c = '?';
......
...@@ -63,6 +63,8 @@ static int ascii_write(EncodingPriv *e, UCS4 u, char **s, int *bufsize) ...@@ -63,6 +63,8 @@ static int ascii_write(EncodingPriv *e, UCS4 u, char **s, int *bufsize)
if (u < 0x80) if (u < 0x80)
c = u; c = u;
else if (e->for_encoding == encoding_WRITE_STRICT)
return -1;
else else
c = '?'; c = '?';
......
...@@ -123,6 +123,8 @@ static EncList enclist[] = ...@@ -123,6 +123,8 @@ static EncList enclist[] =
{ csISOLatin13, 1, "/ISO-8859-13/", lang_ENGLISH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x59" }, /* Select Baltic Rim right half */ { csISOLatin13, 1, "/ISO-8859-13/", lang_ENGLISH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x59" }, /* Select Baltic Rim right half */
{ csAcornLatin1, 1, "/X-ACORN-LATIN1/", lang_ENGLISH, (EncodingPriv *)ENC_eightbit, "Acorn.Latin1" }, { csAcornLatin1, 1, "/X-ACORN-LATIN1/", lang_ENGLISH, (EncodingPriv *)ENC_eightbit, "Acorn.Latin1" },
{ csAcornFuzzy, 1, "/X-ACORN-FUZZY/", lang_ENGLISH, (EncodingPriv *)ENC_acorn, "Acorn.Latin1" }, { csAcornFuzzy, 1, "/X-ACORN-FUZZY/", lang_ENGLISH, (EncodingPriv *)ENC_acorn, "Acorn.Latin1" },
{ csAutodetectJP, 0, "//", lang_JAPANESE, NULL, NULL }, /* entry so we can get language */
{ csEUCorShiftJIS, 0, "//", lang_JAPANESE, NULL, NULL }, /* entry so we can get language */
{ 0, 0, NULL, NULL } { 0, 0, NULL, NULL }
}; };
...@@ -253,7 +255,7 @@ unsigned encoding_read(Encoding *e, encoding_read_callback_fn ucs_out, ...@@ -253,7 +255,7 @@ unsigned encoding_read(Encoding *e, encoding_read_callback_fn ucs_out,
unsigned int n, unsigned int n,
void *handle) void *handle)
{ {
if (e && !e->for_encoding) if (e && e->for_encoding == encoding_READ)
{ {
return e->read(e, ucs_out, s, n, handle); return e->read(e, ucs_out, s, n, handle);
} }
...@@ -262,7 +264,7 @@ unsigned encoding_read(Encoding *e, encoding_read_callback_fn ucs_out, ...@@ -262,7 +264,7 @@ unsigned encoding_read(Encoding *e, encoding_read_callback_fn ucs_out,
int encoding_write(Encoding *e, UCS4 c, char **buf, int *bufsize) int encoding_write(Encoding *e, UCS4 c, char **buf, int *bufsize)
{ {
if (e && e->for_encoding) if (e && e->for_encoding != encoding_READ)
{ {
return e->write(e, c, buf, bufsize); return e->write(e, c, buf, bufsize);
} }
...@@ -278,7 +280,7 @@ int encoding_max_char_size(int enc_num) ...@@ -278,7 +280,7 @@ int encoding_max_char_size(int enc_num)
const char *encoding_default_language(int enc_num) const char *encoding_default_language(int enc_num)
{ {
EncList *e = find_enclist(enc_num); EncList *e = find_enclist(enc_num);
return e ? e->lang : "en"; return e ? e->lang : lang_ANY;
} }
/* ----------------------------------------------------------------------------- */ /* ----------------------------------------------------------------------------- */
...@@ -332,6 +334,32 @@ int encoding_lookup_in_table(UCS4 u, encoding_table t) ...@@ -332,6 +334,32 @@ int encoding_lookup_in_table(UCS4 u, encoding_table t)
return -1; return -1;
} }
/*
 * Release one encoding table descriptor.
 *
 * A NULL 't' is accepted and ignored. When the descriptor's 'alloc' flag
 * is set, the data it points at (t->table) is released first; the
 * descriptor itself is always released. Both releases go through the
 * library's encoding__free hook.
 */
static void encoding_table_free(encoding_table t)
{
    if (!t)
        return;

    if (t->alloc)
    {
        encoding__free(t->table);
    }

    encoding__free(t);
}
/*
 * Walk the list of loaded tables (headed by enc_table_list) and free every
 * entry whose usage count is zero or less.
 *
 * The successor of each node is read *before* the node can be freed, so the
 * walk never touches released memory. 'prev' only ever advances onto nodes
 * that were kept, and when the first node of the list is removed the list
 * head itself is moved on, so no dangling pointer is left behind.
 */
void encoding_table_remove_unused(void)
{
    encoding_table t, prev, next;

    for (prev = NULL, t = enc_table_list; t; t = next)
    {
        /* fetch the link now: 't' may not exist after this iteration */
        next = t->next;

        if (t->usage <= 0)
        {
            /* unlink from either the predecessor or the list head */
            if (prev)
                prev->next = next;
            else
                enc_table_list = next;

            encoding_table_free(t);
        }
        else
        {
            /* node survives, so it becomes the predecessor of what follows */
            prev = t;
        }
    }
}
static encoding_table look_for_table(const char *name, encoding_table *pprev) static encoding_table look_for_table(const char *name, encoding_table *pprev)
{ {
encoding_table t, prev; encoding_table t, prev;
...@@ -375,7 +403,7 @@ static int mime_token_char(UCS4 c) ...@@ -375,7 +403,7 @@ static int mime_token_char(UCS4 c)
} }
} }
int encoding_number_from_name(const char *name) static int encoding__number_from_name(const char *name)
{ {
const char *p = name; const char *p = name;
char *temp, *tp; char *temp, *tp;
...@@ -406,6 +434,20 @@ int encoding_number_from_name(const char *name) ...@@ -406,6 +434,20 @@ int encoding_number_from_name(const char *name)
return e->identifier; return e->identifier;
} }
/*
 * Map an encoding name to its encoding number.
 *
 * The name is first looked up exactly as given via
 * encoding__number_from_name(). If that yields 0 and the name starts with
 * an 'x-' or 'X-' prefix, the lookup is retried once with the prefix
 * removed. The result of whichever lookup was performed last is returned.
 */
int encoding_number_from_name(const char *name)
{
    int first_try = encoding__number_from_name(name);

    if (first_try != 0)
        return first_try;

    /* only 'x' / 'X' can introduce a strippable prefix */
    if (name[0] != 'x' && name[0] != 'X')
        return first_try;

    if (name[1] != '-')
        return first_try;

    /* second pass: same lookup on the name without its 'x-' prefix */
    return encoding__number_from_name(name + 2);
}
/* To be supplied by the application */ /* To be supplied by the application */
/* extern void encoding_leaf_to_path(char *out, const char *leaf); */ /* extern void encoding_leaf_to_path(char *out, const char *leaf); */
...@@ -528,7 +570,7 @@ encoding_table encoding_load_map_file(const char *leaf) ...@@ -528,7 +570,7 @@ encoding_table encoding_load_map_file(const char *leaf)
void encoding_discard_map_file(encoding_table t) void encoding_discard_map_file(encoding_table t)
{ {
t->alloc--; if (t) t->usage--;
} }
encoding_alloc_fn encoding__alloc = malloc; encoding_alloc_fn encoding__alloc = malloc;
......
...@@ -235,6 +235,16 @@ encoding_table iso2022_find_table(int type, int id) ...@@ -235,6 +235,16 @@ encoding_table iso2022_find_table(int type, int id)
return encoding_load_map_file(fname); return encoding_load_map_file(fname);
} }
/*
 * Dispose of one ISO 2022 character set object.
 *
 * The set is treated as a simple_set: its table reference is handed back
 * through encoding_discard_map_file() and then the set structure itself is
 * released with encoding__free(). Passing NULL is a no-op.
 */
static void set_free(ISO2022_Set *Set)
{
    simple_set *simple = (simple_set *)Set;

    if (!simple)
        return;

    encoding_discard_map_file(simple->table);
    encoding__free(simple);
}
static int iso2022_select_set(ISO2022_Encoding *i, int setno, int type, int id) static int iso2022_select_set(ISO2022_Encoding *i, int setno, int type, int id)
{ {
encoding_table t; encoding_table t;
...@@ -276,7 +286,7 @@ static int iso2022_select_set(ISO2022_Encoding *i, int setno, int type, int id) ...@@ -276,7 +286,7 @@ static int iso2022_select_set(ISO2022_Encoding *i, int setno, int type, int id)
/* fdebugf(stderr, "iso2022_select_set: free %p new %p tempset %d oldset %p\n", i->Set[setno], set, i->tempset, i->oldset); */ /* fdebugf(stderr, "iso2022_select_set: free %p new %p tempset %d oldset %p\n", i->Set[setno], set, i->tempset, i->oldset); */
encoding__free(i->Set[setno]); set_free(i->Set[setno]);
i->Set[setno] = set; i->Set[setno] = set;
if (i->CL_s == setno) i->CL = i->Set[setno]; if (i->CL_s == setno) i->CL = i->Set[setno];
...@@ -290,12 +300,12 @@ static int iso2022_select_set(ISO2022_Encoding *i, int setno, int type, int id) ...@@ -290,12 +300,12 @@ static int iso2022_select_set(ISO2022_Encoding *i, int setno, int type, int id)
static void iso2022_delete(EncodingPriv *e) static void iso2022_delete(EncodingPriv *e)
{ {
ISO2022_Encoding *i = (ISO2022_Encoding *) e; ISO2022_Encoding *i = (ISO2022_Encoding *) e;
encoding__free(i->Set[G0]); set_free(i->Set[G0]);
encoding__free(i->Set[G1]); set_free(i->Set[G1]);
encoding__free(i->Set[G2]); set_free(i->Set[G2]);
encoding__free(i->Set[G3]); set_free(i->Set[G3]);
encoding__free(i->Set[C0]); set_free(i->Set[C0]);
encoding__free(i->Set[C1]); set_free(i->Set[C1]);
} }
static int iso2022_reset(Encoding *e, int for_encoding) static int iso2022_reset(Encoding *e, int for_encoding)
...@@ -332,7 +342,7 @@ static int iso2022_reset(Encoding *e, int for_encoding) ...@@ -332,7 +342,7 @@ static int iso2022_reset(Encoding *e, int for_encoding)
return 0; return 0;
} }
if (for_encoding) if (for_encoding != encoding_READ)
{ {
const char *s = e->list_entry->encoder_data; const char *s = e->list_entry->encoder_data;
if (s) if (s)
...@@ -349,7 +359,11 @@ static int iso2022_reset(Encoding *e, int for_encoding) ...@@ -349,7 +359,11 @@ static int iso2022_reset(Encoding *e, int for_encoding)
{ {
if (last_s) if (last_s)
{ {
i->table[tab].table = ((simple_set *)i->Set[G0])->table; simple_set *ss = (simple_set *)i->Set[G0];
i->table[tab].table = ss->table; /* copy over table ptr */
ss->table = NULL; /* null entry so it doesn't get discarded */
i->table[tab].esc_seq = last_s; i->table[tab].esc_seq = last_s;
i->table[tab].esc_seq_len = s - last_s; i->table[tab].esc_seq_len = s - last_s;
...@@ -371,8 +385,7 @@ static int iso2022_reset(Encoding *e, int for_encoding) ...@@ -371,8 +385,7 @@ static int iso2022_reset(Encoding *e, int for_encoding)
s++; s++;
} }
/* free the set used in G0 */ set_free(i->Set[G0]);
encoding__free(i->Set[G0]);
i->Set[G0] = 0; i->Set[G0] = 0;
} }
...@@ -654,6 +667,9 @@ static int iso2022_write_escapes(EncodingPriv *e, UCS4 u, char **ps, int *bufsiz ...@@ -654,6 +667,9 @@ static int iso2022_write_escapes(EncodingPriv *e, UCS4 u, char **ps, int *bufsiz
out += write_index(index, buf + out, n_entries, 1); out += write_index(index, buf + out, n_entries, 1);
} }
else if (e->for_encoding == encoding_WRITE_STRICT)
return -1;
/* special chars */ /* special chars */
else /* if (u == 0xFFFD) */ /* bad character */ else /* if (u == 0xFFFD) */ /* bad character */
buf[out++] = '?'; buf[out++] = '?';
...@@ -772,6 +788,9 @@ static int iso2022_write_euc(EncodingPriv *e, UCS4 u, char **euc, int *bufsize) ...@@ -772,6 +788,9 @@ static int iso2022_write_euc(EncodingPriv *e, UCS4 u, char **euc, int *bufsize)
} }
} }
else if (e->for_encoding == encoding_WRITE_STRICT)
return -1;
/* special chars */ /* special chars */
else /* if (u == 0xFFFD) */ /* bad character */ else /* if (u == 0xFFFD) */ /* bad character */
buf[out++] = '?'; buf[out++] = '?';
......
...@@ -243,6 +243,9 @@ static int shiftjis_write(EncodingPriv *e, UCS4 u, char **sjis, int *bufsize) ...@@ -243,6 +243,9 @@ static int shiftjis_write(EncodingPriv *e, UCS4 u, char **sjis, int *bufsize)
else if (u == 0x2026) else if (u == 0x2026)
c = 0xFF; /* Ellipsis - a Mac extension */ c = 0xFF; /* Ellipsis - a Mac extension */
else if (e->for_encoding == encoding_WRITE_STRICT)
return -1;
else /* if (u == 0xFFFD) */ /* bad character */ else /* if (u == 0xFFFD) */ /* bad character */
c = '?'; c = '?';
......
...@@ -49,7 +49,15 @@ extern int encoding_write(Encoding *e, UCS4 c, char **buf, int *bufsize); ...@@ -49,7 +49,15 @@ extern int encoding_write(Encoding *e, UCS4 c, char **buf, int *bufsize);
* To obtain a new decoder session, use the following calls. * To obtain a new decoder session, use the following calls.
* *
* new_encoding takes an encoding number. These numbers are Internet MIB numbers. * new_encoding takes an encoding number. These numbers are Internet MIB numbers.
* for_encoding is one of four values describing whether the encoding will be used
* with 'read' or 'write' methods and how writing should handle encoding errors.
*/ */
#define encoding_READ 0 /* to UTF 8 */
#define encoding_WRITE 1 /* from UTF8, single alternate for unencodable characters */
#define encoding_WRITE_STRICT 2 /* from UTF8 and return -1 if unencodable character */
#define encoding_WRITE_LOOSE 3 /* from UTF8, alternate list for unencodable characters */
extern Encoding *encoding_new(int n, int for_encoding); extern Encoding *encoding_new(int n, int for_encoding);
/* /*
...@@ -58,12 +66,12 @@ extern Encoding *encoding_new(int n, int for_encoding); ...@@ -58,12 +66,12 @@ extern Encoding *encoding_new(int n, int for_encoding);
*/ */
extern void encoding_delete(Encoding *e); extern void encoding_delete(Encoding *e);
/* /*
* reset_decoder() resets the decoder to its default state. Some encodings * reset_decoder() resets the decoder to its default state. Some encodings
* have state (eg ISO 2022's character set and UCS-2's byte ordering) - this * have state (eg ISO 2022's character set and UCS-2's byte ordering) - this
* will reset to allow a stream to be rescanned or a new stream to be * will reset to allow a stream to be rescanned or a new stream to be
* decoded. * decoded.
*/ */
extern int encoding_reset(Encoding *e); extern int encoding_reset(Encoding *e);
...@@ -87,6 +95,18 @@ typedef void (*encoding_free_fn)(void *ptr); ...@@ -87,6 +95,18 @@ typedef void (*encoding_free_fn)(void *ptr);
extern void encoding_set_alloc_fns(encoding_alloc_fn alloc, encoding_free_fn free); extern void encoding_set_alloc_fns(encoding_alloc_fn alloc, encoding_free_fn free);
extern int encoding_max_char_size(int enc_num); extern int encoding_max_char_size(int enc_num);
/*
* Return the default language string (see languages.h) for this encoding.
* Generic encodings such as UTF8 return ""
*/
extern const char *encoding_default_language(int enc_num); extern const char *encoding_default_language(int enc_num);
/*
* Scan the list of loaded tables and free any that have zero usage count.
*/
extern void encoding_table_remove_unused(void);
#endif #endif
...@@ -74,7 +74,7 @@ ...@@ -74,7 +74,7 @@
#define lang_INDONESIAN "in" #define lang_INDONESIAN "in"
#define lang_ICELANDIC "is" #define lang_ICELANDIC "is"
#define lang_ITALIAN "it" #define lang_ITALIAN "it"
#define lang_HEBREW "iw" #define lang_HEBREW "he" /* used to be iw until changed */
#define lang_JAPANESE "ja" #define lang_JAPANESE "ja"
#define lang_YIDDISH "ji" #define lang_YIDDISH "ji"
#define lang_JAVANESE "jv" #define lang_JAVANESE "jv"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment