/* Copyright 1998 Acorn Computers Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include "Unicode/iso10646.h" typedef struct charinfo { UCS4 code; char *name; } charinfo; int numchars=0; static charinfo charlist[0x10000]; void error(const char *p); static void parse_line(char *s) { char *sp = s, *dp; UCS4 code; char name[256]; while (*sp != ';' && *sp != '\0') sp++; *sp='\0'; code = (UCS4) strtol(s, NULL, 16); sp++; dp = name; while (*sp != ';' && *sp != '\0') *dp++=*sp++; *dp='\0'; if (name[0]=='<') return; sp = malloc(dp+1-name); if (sp) { charlist[numchars].code=code; charlist[numchars++].name=sp; strcpy(sp, name); } } static int cmp_name(const void *p1, const void *p2) { charinfo *c1 = (charinfo *) p1; charinfo *c2 = (charinfo *) p2; /* printf("Comparing %04X(%04X) to %04X(%04X)\n", c1-charlist, c1->code, c2-charlist, c2->code); if (c1->name == NULL) { if (c2->name == NULL) return 0; else return +1; } else if (c2->name == NULL) return -1;*/ return strcmp(c1->name, c2->name); } static int cmp_name2(const void *p1, const void *p2) { char *n = (char *) p1; charinfo *c2 = (charinfo *) p2; return strcmp(n, c2->name); } void load_unidata(const char *filename) { FILE *f; char buffer[512]; f = fopen(filename, "r"); if (!f) { perror("fopen"); exit(1); } while (fgets(buffer, 512, f) != NULL) { if (strlen(buffer)==0 || buffer[0]==';') continue; parse_line(buffer); } fclose(f); charlist[numchars].code=0xFFFFFFFF; charlist[numchars++].name="-"; qsort(charlist, numchars, sizeof charlist[0], cmp_name); } static UCS4 get_choseong(const char **p) { static const char list[19][3] = { "G", "GG", "N", "D", "DD", "L", "M", "B", "BB", "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H" }; int i; for (i=18; i>=0; i--) { if (list[i][0]=='\0') continue; if (strncmp(*p, list[i], strlen(list[i]))==0) { (*p)+=strlen(list[i]); return i+0x1100; } } return 0x110B; } static UCS4 get_jungseong(const char **p) { static const char list[21][4] = { "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE" , "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I" }; int i; int l=0; int c=-1; for (i=0; i<21; i++) { if (strncmp(*p, list[i], strlen(list[i]))==0) { if (strlen(list[i]) > l) { l = strlen(list[i]); c = i; } } } if (c != -1) { (*p)+=l; return c+0x1161; } error("Bad jungseong"); return NULL_UCS4; } static UCS4 get_jongseong(const char **p) { static const char list[27][3] = { "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H" }; int i; if (**p == '\0') return 0; for (i=26; i>=0; i--) { if (strncmp(*p, list[i], strlen(list[i]))==0) { (*p)+=strlen(list[i]); return i+0x11A8; } } error("Bad jungseong"); return NULL_UCS4; } static UCS4 hangul_syllable_code(const char *p) { UCS4 L, V, T; L=get_choseong(&p); V=get_jungseong(&p); T=get_jongseong(&p); if (T==0) T=0x11A7; if (*p != '\0') { error("Bad hangul syllable"); return NULL_UCS4; } return ((L-0x1100)*21 + (V-0x1161))*28 + (T-0x11A7) + 0xAC00; } static UCS4 extract_code(const char *p) { char *end; UCS4 c = (UCS4) strtol(p, &end, 16); if (*end != '\0') { error("Bad hex"); return NULL_UCS4; } return c; } UCS4 UCS_from_name(const char *name) { charinfo *ci; if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22)==0) { UCS4 c=extract_code(name+22); if (c<0x4E00 || c>0x9FA5) error("Bad ideograph"); return c; } if (strncmp(name, "HANGUL SYLLABLE ", 16)==0) return hangul_syllable_code(name+16); if (strncmp(name, "U+", 2)==0) { UCS4 c=extract_code(name+2); if (c > 0x7FFFFFFF) error("Bad UCS"); return c; } ci = (charinfo *) bsearch(name, charlist, numchars, sizeof charlist[0], cmp_name2); if (!ci) { error("Unknown character"); return NULL_UCS4; } return ci->code; } const char *name_from_UCS(UCS4 u) { char buffer[256]; int i; if (u >= 0x4E00 && u <= 0x9FA5) { sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", u); return buffer; } else if (u >= 0xAC00 && u <= 0xD7A3) { /* Hangul syllable - too hard */ sprintf(buffer, "U+%X\n", u); return buffer; } for (i=0; i<numchars; i++) if (charlist[i].code == u) return charlist[i].name; sprintf(buffer, "U+%X\n", u); return buffer; } #if 0 void error(const char *p) { fputs(p, stderr); fputc('\n', stderr); //exit(1); } int main() { //int i=0; char buffer[512]; load_unidata("UniData215"); /*while (charlist[i].code != 0xFFFFFFFF) { printf("%50s %08X\n", charlist[i].name, charlist[i].code); i++; } */ for (;;) { gets(buffer); printf("%08X\n", UCS_from_name(buffer)); } return 0; } #endif