Added new file 'languages.h' with some ISO639 language codes.

Added a default language field to each encoding (using the above codes). Added a max char size field to each encoding. Tidied up some of the reencoders' behaviour when the output ptr is NULL. Fixed a load of charset numbers which were wrong. New UTF8 function to skip multiple characters in a string. Fixed RISC OS build which was out of date. Version 0.04. Tagged as 'Unicode-0_04'

Added new file 'languages.h' with some ISO639 language codes.
Added a default language field to each encoding (using the above codes). Added a max char size field to each encoding. Tidied up some of the reencoders' behaviour when the output ptr is NULL. Fixed a load of charset numbers which were wrong. New UTF8 function to skip multiple characters in a string. Fixed RISC OS build which was out of date. Version 0.04. Tagged as 'Unicode-0_04'
fa3fa475 · Simon Middleton · 1c323496 · fa3fa475 · fa3fa475 · fa3fa475
Commit fa3fa475 authored 27 years ago by Simon Middleton
15 changed files
--- a/Makefile
+++ b/Makefile
@@ -81,10 +81,7 @@ OBJS = autojp.o \
 	utf8.o \
 	encoding.o \
 	iso2022.o \
-	koi8r.o \
+	eightbit.o \
-	microsoft.o \
-	acorn.o \
-	apple.o \
 	shiftjis.o \
 	bigfive.o \
 	enc_utf8.o \
@@ -98,10 +95,7 @@ OBJSZ = \
 	oz.utf8 \
 	oz.encoding \
 	oz.iso2022 \
-	oz.koi8r \
+	oz.eightbit \
-	oz.microsoft \
-	oz.acorn \
-	oz.apple \
 	oz.shiftjis \
 	oz.bigfive \
 	oz.enc_utf8 \
@@ -115,10 +109,7 @@ OBJSD = \
 	od.utf8 \
 	od.encoding \
 	od.iso2022 \
-	od.koi8r \
+	od.eightbit \
-	od.microsoft \
-	od.acorn \
-	od.apple \
 	od.shiftjis \
 	od.bigfive \
 	od.enc_utf8 \
@@ -126,9 +117,9 @@ OBJSD = \
 	od.enc_utf16 \
 	od.enc_ucs4
-HDRS = autojp.h charsets.h encoding.h iso10646.h unictype.h utf8.h
+HDRS = autojp.h charsets.h encoding.h iso10646.h languages.h unictype.h utf8.h
-LIBS = ${TARGET} ${TARGETD} 
+LIBS = ${TARGET} ${TARGETD} ${TARGETZ} 
 #
 # Rule patterns
@@ -173,6 +164,7 @@ export_hdrs: ${HDRS} dirs
 	${CP} h.charsets ${UNICODELIBDIR}.h.* ${CPFLAGS}
 	${CP} h.encoding ${UNICODELIBDIR}.h.* ${CPFLAGS}
 	${CP} h.iso10646 ${UNICODELIBDIR}.h.* ${CPFLAGS}
+	${CP} h.languages ${UNICODELIBDIR}.h.* ${CPFLAGS}
 	${CP} h.unictype ${UNICODELIBDIR}.h.* ${CPFLAGS}
 	${CP} h.utf8     ${UNICODELIBDIR}.h.* ${CPFLAGS}
 	@echo ${COMPONENT}: export complete (hdrs)
@@ -180,9 +172,9 @@ export_hdrs: ${HDRS} dirs
 export_libs: ${LIBS} dirs
 	${CP} ${TARGET}  ${UNICODELIBDIR}.o.* ${CPFLAGS}
 	${CP} ${TARGETD} ${UNICODELIBDIR}.o.* ${CPFLAGS}
+	${CP} ${TARGETZ} ${UNICODELIBDIR}.o.* ${CPFLAGS}
 	@echo ${COMPONENT}: export complete (libs)
-#	${CP} ${TARGETZ} ${UNICODELIBDIR}.o.* ${CPFLAGS}
 local_dirs:
 	${MKDIR} o
@@ -202,7 +194,14 @@ unictype.c: mkunictype data.UNIDATA2
 mkunictype: mkunictype.o ${CLIB}
 	${LD} -o $@ mkunictype.o ${CLIB}
-unictype.o oz.unictype od.unictype: unictype.c
+unictype.o od.unictype oz.unictype: unictype.c
+links:
+	./mklinks
+	echo Made links
+#od.unictype: unictype.c
+#	${CC} ${CCFLAGS} -DROM=0 -DDEBUG=1         -o $@ unictype.c
 #
 # Final link

--- a/VersionNum
+++ b/VersionNum
-/* (0.03)
+/* (0.04)
 *
 * This file is automatically maintained by srccommit, do not edit manually.
 *
 */
-#define Module_MajorVersion_CMHG     	0.03
+#define Module_MajorVersion_CMHG     	0.04
 #define Module_MinorVersion_CMHG	
-#define Module_Date_CMHG      		12 Nov 1997
+#define Module_Date_CMHG      		21 Nov 1997
-#define Module_MajorVersion     	"0.03"
+#define Module_MajorVersion     	"0.04"
 #define Module_MinorVersion		""
-#define Module_Date      		"12 Nov 1997"
+#define Module_Date      		"21 Nov 1997"
--- a/c/enc_ucs4
+++ b/c/enc_ucs4
@@ -98,7 +98,7 @@ static int ucs4_write(EncodingPriv *e, UCS4 u, char **pucs4, int *bufsize)
 {
    char *ucs4;
-    if ((*bufsize -= 4) < 0)
+    if ((*bufsize -= 4) < 0 || !pucs4)
 	return 0;
    ucs4 = *pucs4;

--- a/c/enc_utf16
+++ b/c/enc_utf16
@@ -129,7 +129,7 @@ static int utf16_write(EncodingPriv *e, UCS4 u, char **putf16, int *bufsize)
 	c = 0xFFFD;
    }
-    if ((*bufsize -= (cc ? 4 : 2)) < 0)
+    if ((*bufsize -= (cc ? 4 : 2)) < 0 || !putf16)
 	return 0;
    *(*putf16)++ = (c >> 8) & 0xff;

--- a/c/enc_utf8
+++ b/c/enc_utf8
@@ -106,7 +106,7 @@ static int utf8_write(EncodingPriv *e, UCS4 u, char **utf8, int *bufsize)
 {
    int len = UTF8_codelen(u);
-    if ((*bufsize -= len) < 0)
+    if ((*bufsize -= len) < 0 || !utf8)
 	return 0;
    *utf8 = UCS4_to_UTF8(*utf8, u);

--- a/c/encoding
+++ b/c/encoding
@@ -28,6 +28,7 @@
 #include "enc_ucs4.h"
 #include "charsets.h"
+#include "languages.h"
 #include "VersionNum"
@@ -52,17 +53,17 @@ static char version[] = "Unicode library " Module_MajorVersion " " Module_Date "
 static EncList enclist[] =
 {
- {    3, "/US-ASCII/", (EncodingPriv *)ENC_ascii, NULL },
+ {   csASCII /* 3 */, 1, "/US-ASCII/", lang_ENGLISH, (EncodingPriv *)ENC_ascii, NULL },
- {    4, "/ISO-8859-1/ISO-IR-101/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x41\x1B\x2F\x50"	/* Select Latin-1 right half and G3 supplement */ },
+ {   csISOLatin1 /* 4 */, 1, "/ISO-8859-1/ISO-IR-101/", lang_ENGLISH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x41\x1B\x2F\x50"	/* Select Latin-1 right half and G3 supplement */ },
- {    5, "/ISO-8859-2/ISO-IR-102/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x42\x1B\x2F\x50" },	/* Select Latin-2 right half */
+ {   csISOLatin2 /* 5 */, 1, "/ISO-8859-2/ISO-IR-102/", lang_ENGLISH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x42\x1B\x2F\x50" },	/* Select Latin-2 right half */
- {    6, "/ISO-8859-3/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x43\x1B\x2F\x50" },		/* Select Latin-3 right half */
+ {   csISOLatin3 /* 6 */, 1, "/ISO-8859-3/", lang_ENGLISH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x43\x1B\x2F\x50" },		/* Select Latin-3 right half */
- {    7, "/ISO-8859-4/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x44\x1B\x2F\x50" },		/* Select Latin-4 right half */
+ {   csISOLatin4 /* 7 */, 1, "/ISO-8859-4/", lang_ENGLISH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x44\x1B\x2F\x50" },		/* Select Latin-4 right half */
- {    8, "/ISO-8859-5/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x4C" },			/* Select Cyrillic right half */
+ {   csISOLatinCyrillic /* 8 */, 1, "/ISO-8859-5/", lang_RUSSIAN, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x4C" },			/* Select Cyrillic right half */
- {   10, "/ISO-8859-7/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x46" },			/* Select Greek right half */
+ {   csISOLatinGreek /* 10 */, 1, "/ISO-8859-7/", lang_GREEK, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x46" },			/* Select Greek right half */
- {   11, "/ISO-8859-8/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x48" },			/* Select Hebrew right half */
+ {   csISOLatinHebrew /* 11 */, 1, "/ISO-8859-8/", lang_HEBREW, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x48" },			/* Select Hebrew right half */
- {   12, "/ISO-8859-9/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x4D\x1B\x2F\x50" },		/* Select Latin-5 right half */
+ {   csISOLatin5 /* 12 */, 1, "/ISO-8859-9/", lang_ENGLISH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x4D\x1B\x2F\x50" },		/* Select Latin-5 right half */
- {   13, "/ISO-8859-10/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x56\x1B\x2E\x58" },		/* Select Latin-6 right half, and Sami supplement as G2 */
+ {   csISOLatin6 /* 13 */, 1, "/ISO-8859-10/", lang_ENGLISH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x56\x1B\x2E\x58" },		/* Select Latin-6 right half, and Sami supplement as G2 */
- {   17, "/SHIFT_JIS/X-SJIS/", (EncodingPriv *)ENC_shiftjis },
+ {   csShiftJIS /* 17 */, 2, "/SHIFT_JIS/X-SJIS/", lang_JAPANESE, (EncodingPriv *)ENC_shiftjis },
     /*
     * Select G0 = JIS X 0201 Roman      (ESC ( J)
@@ -72,7 +73,7 @@ static EncList enclist[] =
     *        C1 = ISO 6429              (ESC " C)
     * Single shift range = GR           (ESC SP \)
     */
- {   18, "/EUC-JP/", (EncodingPriv *)ENC_iso2022_euc,
+ {   csEUCPkdFmtJapanese /* 18 */, 2, "/EUC-JP/", lang_JAPANESE, (EncodingPriv *)ENC_iso2022_euc,
 			      "\x1B\x28\x4A"
                          "\x1B\x24\x29\x42"
                              "\x1B\x2A\x49"
@@ -80,43 +81,41 @@ static EncList enclist[] =
                              "\x1B\x22\x43"
                              "\x1B\x20\x5C" },
- {   37, "/ISO-2022-KR/", (EncodingPriv *)ENC_iso2022_shifts, NULL,
+ {   csISO2022KR /* 37 */, 3, "/ISO-2022-KR/", lang_KOREAN, (EncodingPriv *)ENC_iso2022_shifts, NULL,
 		"\x1B\x24\x29\x43" },
- {   38, "/EUC-KR/", (EncodingPriv *)ENC_iso2022_euc,
+ {   csEUCKR  /* 38 */, 0, "/EUC-KR/", lang_KOREAN, (EncodingPriv *)ENC_iso2022_euc,
 		"\x1B\x24\x29\x43" },			/* Select G1 = KS C 5601 */
- {   39, "/ISO-2022-JP/JIS_Encoding/", (EncodingPriv *)ENC_iso2022_escapes, NULL,
+ {   csISO2022JP /* 39 */, 0, "/ISO-2022-JP/JIS_Encoding/", lang_JAPANESE, (EncodingPriv *)ENC_iso2022_escapes, NULL,
 		"\x1B\x28\x42"
 		"\x1B\x28\x4A"
 		"\x1B\x24\x40"
 		"\x1B\x24\x42" },
- {   40, "/ISO-2022-JP-2/", (EncodingPriv *)ENC_iso2022_escapes, NULL,
+ {   csISO2022JP2 /* 40 */, 0, "/ISO-2022-JP-2/", lang_JAPANESE, (EncodingPriv *)ENC_iso2022_escapes, NULL,
 		"\x1B\x28\x42"
 		"\x1B\x28\x4A"
 		"\x1B\x24\x40"
 		"\x1B\x24\x42"
                "\x1B\x24\x28\x43"
                "\x1B\x24\x28\x44" },
- {  104, "/ISO-2022-CN/", (EncodingPriv *)ENC_iso2022
+ { csISO2022CN /* 104 */, 0, "/ISO-2022-CN/", lang_CHINESE, (EncodingPriv *)ENC_iso2022 },
- },
+ { csISO2022CN_EXT /* 105 */, 0, "/ISO-2022-CN-EXT/", lang_CHINESE, (EncodingPriv *)ENC_iso2022 },
- {  105, "/ISO-2022-CN-EXT/", (EncodingPriv *)ENC_iso2022
+ { csUTF8 /* 106 */, 6, "/UTF-8/UNICODE-1-1-UTF-8/UNICODE-2-0-UTF-8/", lang_ENGLISH, (EncodingPriv *)ENC_utf8 },		/* More general!!! */
- },
+ { csUCS4 /* 1001 */, 4, "/ISO-10646-UCS-4/UCS-4/", lang_ENGLISH, (EncodingPriv *)ENC_ucs4 },
- {  106, "/UTF-8/UNICODE-1-1-UTF-8/UNICODE-2-0-UTF-8/", (EncodingPriv *)ENC_utf8 },		/* More general!!! */
+ { csUnicode11 /* 1010 */, 8, "/UCS-2/UTF-16/ISO-10646-UCS-2/UNICODE-1-1/UNICODE-2-0/", lang_ENGLISH, (EncodingPriv *)ENC_utf16 }, /* More general!!! */
- { 1001, "/ISO-10646-UCS-4/UCS-4/", (EncodingPriv *)ENC_ucs4 },
+ { 2022, 0, "/ISO-2022/", lang_ENGLISH, (EncodingPriv *)ENC_iso2022 },
- { 1010, "/UCS-2/UTF-16/ISO-10646-UCS-2/UNICODE-1-1/UNICODE-2-0/", (EncodingPriv *)ENC_utf16 }, /* More general!!! */
+ { csGB2312 /* 2025 */, 2, "/X-EUC_CN/GB2312/CN-GB/GB_2312-80", lang_CHINESE, (EncodingPriv *)ENC_iso2022_euc,
- { 2022, "/ISO-2022/", (EncodingPriv *)ENC_iso2022 },
- { 2025, "/X-EUC_CN/GB2312/CN-GB/GB_2312-80", (EncodingPriv *)ENC_iso2022_euc,
 		"\x1B\x24\x29\x41" },						/* Select G1 = GB 2312-80 */
- { 2026, "/CN-BIG5/BIG5/", (EncodingPriv *)ENC_bigfive },
+ { csBig5 /* 2026 */, 2, "/CN-BIG5/BIG5/", lang_CHINESE, (EncodingPriv *)ENC_bigfive },
- { 2027, "/MACINTOSH/", (EncodingPriv *)ENC_eightbit, "Apple.MacRoman" },
+ { csMacintosh /* 2027 */, 1, "/MACINTOSH/", lang_ENGLISH, (EncodingPriv *)ENC_eightbit, "Apple.MacRoman" },
- { 2084, "/KOI8-R/", (EncodingPriv *)ENC_eightbit, "KOI8-R" },
+ { csKOI8R /* 2084 */, 1, "/KOI8-R/", lang_RUSSIAN, (EncodingPriv *)ENC_eightbit, "KOI8-R" },
- { 2250, "/WINDOWS-1250/", (EncodingPriv *)ENC_eightbit, "Microsoft.CP1250" },
+ { csWindows1250 /* 2250 */, 1, "/WINDOWS-1250/", lang_ENGLISH, (EncodingPriv *)ENC_eightbit, "Microsoft.CP1250" },
- { 2252, "/WINDOWS-1252/", (EncodingPriv *)ENC_eightbit, "Microsoft.CP1252" },
+ { csWindows1252 /* 2252 */, 1, "/WINDOWS-1252/", lang_ENGLISH, (EncodingPriv *)ENC_eightbit, "Microsoft.CP1252" },
- { csWelsh, "/ISO-IR-182/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x5C" },			/* Select Welsh right half */
+ { csWelsh, 1, "/ISO-IR-182/", lang_WELSH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x5C" },			/* Select Welsh right half */
 /*  { 4001, "/ISO-IR-179/", (EncodingPriv *)ENC_baltic_rim }, */
- { csSami, "/ISO-8859-15/ISO-IR-197/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x5D" },		/* Select Sami right half */
+ { csSami, 1, "/ISO-8859-15/ISO-IR-197/", lang_ENGLISH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x5D" },		/* Select Sami right half */
- { csISOLatin13, "/ISO-8859-13/", (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x59" },		/* Select Baltic Rim right half */
+ { csISOLatin13, 1, "/ISO-8859-13/", lang_ENGLISH, (EncodingPriv *)ENC_iso8859, "\x1B\x2D\x59" },		/* Select Baltic Rim right half */
- { csAcornLatin1, "/X-ACORN-LATIN1/", (EncodingPriv *)ENC_eightbit, "Acorn.Latin1" },
+ { csAcornLatin1, 1, "/X-ACORN-LATIN1/", lang_ENGLISH, (EncodingPriv *)ENC_eightbit, "Acorn.Latin1" },
- { 0, NULL, NULL }
+ { 0, 0, NULL, NULL }
 };
 /* ----------------------------------------------------------------------------- */
@@ -176,16 +175,23 @@ static void fixup(void)
 /* ----------------------------------------------------------------------------- */
+/*
+ * Search the encoding table for the entry whose identifier equals n.
+ * The table is walked until an entry with a zero identifier is reached;
+ * NULL is returned when no entry matches.
+ */
+static EncList *find_enclist(int n)
+{
+    EncList *entry = enclist;
+    while (entry->identifier != 0)
+    {
+	if (entry->identifier == n)
+	    return entry;
+	entry++;
+    }
+    return NULL;
+}
 Encoding *encoding_new(int n, int for_encoding)
 {
-    struct EncList *e = enclist;
+    struct EncList *e;
    EncodingPriv *enc;
    fixup();
-    for (e = enclist; e->identifier; e++)
+    if ((e = find_enclist(n)) != NULL)
-    {
-        if (e->identifier == n)
    {
 	int size = sizeof(EncodingPriv) + e->encoding->ws_size;
 	enc = encoding__alloc(size);
@@ -205,7 +211,6 @@ Encoding *encoding_new(int n, int for_encoding)
 	return enc;
    }
-    }
    return NULL;
 }
@@ -253,6 +258,18 @@ int encoding_write(Encoding *e, UCS4 c, char **buf, int *bufsize)
    return 0;
 }
+/*
+ * Return the max_char_size recorded in the encoding table for enc_num.
+ * Returns 0 when enc_num is not in the table.
+ */
+int encoding_max_char_size(int enc_num)
+{
+    EncList *entry = find_enclist(enc_num);
+    if (entry == NULL)
+	return 0;
+    return entry->max_char_size;
+}
+/*
+ * Return the default language string recorded in the encoding table for
+ * enc_num. When enc_num is not in the table the English code is returned.
+ */
+const char *encoding_default_language(int enc_num)
+{
+    EncList *e = find_enclist(enc_num);
+    /* fall back via the languages.h macro rather than a second literal "en" */
+    return e ? e->lang : lang_ENGLISH;
+}
 /* ----------------------------------------------------------------------------- */
 #define DEPTH_CUTOFF	3

--- a/c/iso2022
+++ b/c/iso2022
@@ -650,7 +650,7 @@ static int iso2022_write_escapes(EncodingPriv *e, UCS4 u, char **ps, int *bufsiz
    }
    /* check space */
-    if ((*bufsize -= out) < 0 || !*ps)
+    if ((*bufsize -= out) < 0 || !ps)
 	return 0;
    /* copy out and update ptr */

--- a/c/utf8
+++ b/c/utf8
@@ -234,6 +234,7 @@ int UTF8_seqlen(char c)
 */
 char *UTF8_next(const char *p)
 {
+    if (*p)
 	while ((*++p & 0xC0u) == 0x80u)
 	    continue;
@@ -250,3 +251,18 @@ char *UTF8_prev(const char *p)
    return (char *) p;
 }
+/*
+ * Skip forward the given number of UTF8 'characters'.
+ *
+ *   p       - start of the text to step through
+ *   n_chars - number of characters to skip; zero or negative skips nothing
+ *
+ * Returns the advanced pointer. Each step is made with UTF8_next, and the
+ * walk stops as soon as a zero byte is reached.
+ */
+char *UTF8_next_n(const char *p, int n_chars)
+{
+    const char *s = p;
+    /* '> 0' keeps a negative count from decrementing through INT_MIN */
+    while (n_chars-- > 0 && *s)
+	s = UTF8_next(s);
+    return (char *)s;
+}
+/* eof utf8.c */
--- a/h/charsets
+++ b/h/charsets
@@ -42,7 +42,9 @@
 #define csEUCKR	38
 #define csISO2022JP	39
 #define csISO2022JP2	40
-#define csISO13JISC6220jp	104
+#define csISO2022CN	104
+#define csISO2022CN_EXT	105
+#define csISO13JISC6220jp	41
 #define csISO14JISC6220ro	42
 #define csISO15Italian	22
 #define csISO16Portuguese	43
@@ -188,7 +190,8 @@
 #define csKOI8R	2084
 #define csUnicode11	1010
 #define csUnicode11UTF7	103
-#define csJISEncoding	106
+#define csUTF8		106
+#define csJISEncoding	16
 #define csShiftJIS	17
 #define csEUCPkdFmtJapanese	18
 #define csEUCFixWidJapanese	19

--- a/h/encoding
+++ b/h/encoding
@@ -86,4 +86,7 @@ typedef void (*encoding_free_fn)(void *ptr);
 extern void encoding_set_alloc_fns(encoding_alloc_fn alloc, encoding_free_fn free);
+extern int encoding_max_char_size(int enc_num);
+extern const char *encoding_default_language(int enc_num);
 #endif
--- a/h/encpriv
+++ b/h/encpriv
@@ -35,7 +35,9 @@ typedef struct EncList EncList;
 struct EncList
 {
    int identifier;
+    int max_char_size;		/* maximum size of an encoded character in bytes, 0 means could be huge */
    char *names;
+    char *lang;			/* default language for this encoding */
    EncodingPriv *encoding;
    char *preload;
    char *encoder_data;

--- a/h/languages
+++ b/h/languages
+/* Copyright 1997 Acorn Computers Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* > languages.h
+ *
+ */
+#ifndef __unicode_languages_h
+#define __unicode_languages_h
+/* ISO 639 Language codes - a selection */
+#define lang_AFAR            "aa"
+#define lang_ABKHAZIAN       "ab"
+#define lang_AFRIKAANS       "af"
+#define lang_AMHARIC         "am"
+#define lang_ARABIC          "ar"
+#define lang_ASSAMESE        "as"
+#define lang_AYMARA          "ay"
+#define lang_AZERBAIJANI     "az"
+#define lang_BASHKIR         "ba"
+#define lang_BYELORUSSIAN    "be"
+#define lang_BULGARIAN       "bg"
+#define lang_BIHARI          "bh"
+#define lang_BISLAMA         "bi"
+#define lang_BENGALI	     "bn"
+#define lang_TIBETAN         "bo"
+#define lang_BRETON          "br"
+#define lang_CATALAN         "ca"
+#define lang_CORSICAN        "co"
+#define lang_CZECH           "cs"
+#define lang_WELSH           "cy"
+#define lang_DANISH          "da"
+#define lang_GERMAN          "de"
+#define lang_BHUTANI         "dz"
+#define lang_GREEK           "el"
+#define lang_ENGLISH         "en"
+#define lang_ESPERANTO       "eo"
+#define lang_SPANISH         "es"
+#define lang_ESTONIAN        "et"
+#define lang_BASQUE          "eu"
+#define lang_PERSIAN_FARSI   "fa"	/* ISO 639 code for Persian; "pa" is Punjabi */
+#define lang_FINNISH         "fi"
+#define lang_FIJI            "fj"
+#define lang_FAROESE         "fo"
+#define lang_FRENCH          "fr"
+#define lang_FRISIAN         "fy"
+#define lang_IRISH           "ga"
+#define lang_SCOTS_GAELIC    "gd"
+#define lang_GALICIAN        "gl"
+#define lang_GUARANI         "gn"
+#define lang_GUJARATI        "gu"
+#define lang_HAUSA           "ha"
+#define lang_HINDI           "hi"
+#define lang_CROATIAN        "hr"
+#define lang_HUNGARIAN       "hu"
+#define lang_ARMENIAN        "hy"
+#define lang_INTERLINGUA     "ia"
+#define lang_INTERLINGUE     "ie"
+#define lang_INUPIAK         "ik"
+#define lang_INDONESIAN      "in"
+#define lang_ICELANDIC       "is"
+#define lang_ITALIAN         "it"
+#define lang_HEBREW          "iw"
+#define lang_JAPANESE        "ja"
+#define lang_YIDDISH         "ji"
+#define lang_JAVANESE        "jv"
+#define lang_GEORGIAN        "ka"
+#define lang_KAZAKH          "kk"
+#define lang_GREENLANDIC     "kl"
+#define lang_CAMBODIAN       "km"
+#define lang_KANNADA         "kn"
+#define lang_KOREAN          "ko"
+#define lang_KASHMIRI        "ks"
+#define lang_KURDISH         "ku"
+#define lang_KIRGHIZ         "ky"
+#define lang_LATIN           "la"
+#define lang_LINGALA         "ln"
+#define lang_LAOTHIAN        "lo"
+#define lang_LITHUANIAN      "lt"
+#define lang_LATVIAN	     "lv"
+#define lang_MALAGASY        "mg"
+#define lang_MAORI           "mi"
+#define lang_MACEDONIAN      "mk"
+#define lang_MALAYALAM       "ml"
+#define lang_MONGOLIAN       "mn"
+#define lang_MOLDAVIAN       "mo"
+#define lang_MARATHI         "mr"
+#define lang_MALAY           "ms"
+#define lang_MALTESE         "mt"
+#define lang_BURMESE         "my"
+#define lang_NAURU           "na"
+#define lang_NEPALI          "ne"
+#define lang_DUTCH           "nl"
+#define lang_NORWEGIAN       "no"
+#define lang_OCCITAN         "oc"
+#define lang_AFAN	     "om"
+#define lang_ORIYA           "or"
+#define lang_PUNJABI         "pa"
+#define lang_POLISH          "pl"
+#define lang_PASHTO	     "ps"
+#define lang_PORTUGUESE      "pt"
+#define lang_QUECHUA         "qu"
+#define lang_RHAETO_ROMANCE  "rm"
+#define lang_KURUNDI         "rn"
+#define lang_ROMANIAN        "ro"
+#define lang_RUSSIAN         "ru"
+#define lang_KINYARWANDA     "rw"
+#define lang_SANSKRIT        "sa"
+#define lang_SINDHI          "sd"
+#define lang_SANGHO          "sg"
+#define lang_SERBO_CROATIAN  "sh"
+#define lang_SINGHALESE      "si"
+#define lang_SLOVAK          "sk"
+#define lang_SLOVENIAN       "sl"
+#define lang_SAMOAN          "sm"
+#define lang_SHONA           "sn"
+#define lang_SOMALI          "so"
+#define lang_ALBANIAN        "sq"
+#define lang_SERBIAN         "sr"
+#define lang_SISWATI         "ss"
+#define lang_SESOTHO         "st"
+#define lang_SUNDANESE       "su"
+#define lang_SWEDISH         "sv"
+#define lang_SWAHILI         "sw"
+#define lang_TAMIL           "ta"
+#define lang_TELUGU          "te"
+#define lang_TAJIK           "tg"
+#define lang_THAI            "th"
+#define lang_TIGRINYA        "ti"
+#define lang_TURKMEN         "tk"
+#define lang_TAGALOG         "tl"
+#define lang_SETSWANA        "tn"
+#define lang_TONGA           "to"
+#define lang_TURKISH         "tr"
+#define lang_TSONGA          "ts"
+#define lang_TATAR           "tt"
+#define lang_TWI             "tw"
+#define lang_UKRAINIAN       "uk"
+#define lang_URDU            "ur"
+#define lang_UZBEK           "uz"
+#define lang_VIETNAMESE      "vi"
+#define lang_VOLAPUK         "vo"
+#define lang_WOLOF           "wo"
+#define lang_XHOSA           "xh"
+#define lang_YORUBA          "yo"
+#define lang_CHINESE         "zh"
+#define lang_ZULU            "zu"
+#endif
+/* eof languages.h */
--- a/h/utf8
+++ b/h/utf8
@@ -55,13 +55,20 @@ extern int UTF8_to_UCS4(const char *in, UCS4 *code_out);
 * Note that p + UTF8_seqlen(p[0]) != UTF8_next(p) if p points to the
 * middle of a sequence.
 */
-char *UTF8_next(const char *p);
+extern char *UTF8_next(const char *p);
 /*
 * UTF8_prev reverses the pointer to the previous UTF-8 code in a string.
 * If p points to the middle of a UTF-8 sequence, it will be reversed
 * to the start of that UTF-8 sequence.
 */
-char *UTF8_prev(const char *p);
+extern char *UTF8_prev(const char *p);
+/*
+ * UTF8_next_n advances the pointer over 'n_chars' UTF-8 characters and
+ * returns the new pointer. It steps by calling UTF8_next repeatedly, so the
+ * notes given for UTF8_next (for example when p starts in the middle of a
+ * sequence) apply here as well.
+ */
+extern char *UTF8_next_n(const char *p, int n_chars);
 #endif
--- a/unix/Makefile
+++ b/unix/Makefile
@@ -93,6 +93,7 @@ riscos_export: riscos_libs
 	@cp -p unictype.h $(RISCOS_BUILD_EXPORT)/Lib/Unicode/
 	@cp -p encoding.h $(RISCOS_BUILD_EXPORT)/Lib/Unicode/
 	@cp -p iso10646.h $(RISCOS_BUILD_EXPORT)/Lib/Unicode/
+	@cp -p languages.h $(RISCOS_BUILD_EXPORT)/Lib/Unicode/
 	@cp -p utf8.h $(RISCOS_BUILD_EXPORT)/Lib/Unicode/
 	@echo Copied libraries
@@ -100,6 +101,9 @@ clean:
 	@-rm mkunictype
 	@-rm unictype.c
 	@-rm *.o
+	@-rm *.od
+	@-rm *.oz
+	@-rm ucodelib ucodelibd ucodelibz
 	echo Done clean
 realclean: clean

--- a/unix/mklinks
+++ b/unix/mklinks
@@ -21,6 +21,7 @@ wantlink ../c/unictype unictype.c
 wantlink ../h/unictype unictype.h
 wantlink ../c/mkunictype mkunictype.c
 wantlink ../h/charsets charsets.h
+wantlink ../h/languages languages.h
 wantlink ../h/encpriv encpriv.h
 wantlink ../h/iso10646 iso10646.h