utf8 2.34 KB
Newer Older
Simon Middleton's avatar
Simon Middleton committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
/* Copyright 1997 Acorn Computers Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "iso10646.h"

#ifndef __utf8_h
#define __utf8_h

/*
 * UCS4_to_UTF8 converts a UCS-4 code to UTF-8, storing the result
 * in the array pointed to by out. This array must be large enough
 * to store the resulting UTF-8 element (6 bytes will always be
 * sufficient). The return value is a pointer to the byte after
 * the last one written. Assumes code is a legal UCS value (ie
 * in the range 0-0x7FFFFFFF).
 */
extern char *UCS4_to_UTF8(char *out, UCS4 code);

/*
 * UTF8_codelen gives the length of a UCS-4 code when converted to UTF-8
 * using the above function.
 */
extern int UTF8_codelen(UCS4 code);

/*
 * UTF8_seqlen gives the length of a UTF-8 sequence, given its first
 * byte.
 */
extern int UTF8_seqlen(char c);

/*
 * UTF8_to_UCS4 takes a pointer to a UTF-8 sequence and outputs
 * the corresponding UCS4 code, returning the number of bytes consumed.
 * Incorrect sequences are dealt with by returning 1 and setting
 * *code_out to 0xFFFD.
 */
extern int UTF8_to_UCS4(const char *in, UCS4 *code_out);

/*
 * UTF8_next advances the pointer to the next UTF-8 code in a string.
 * If p points to the middle of a UTF-8 sequence, it will be advanced
 * to the next UTF-8 sequence.
 *
 * Note that p + UTF8_seqlen(p[0]) != UTF8_next(p) if p points to the
 * middle of a sequence.
 */
58
extern char *UTF8_next(const char *p);
Simon Middleton's avatar
Simon Middleton committed
59 60 61 62 63 64

/*
 * UTF8_prev reverses the pointer to the previous UTF-8 code in a string.
 * If p points to the middle of a UTF-8 sequence, it will be reversed
 * to the start of that UTF-8 sequence.
 */
65 66 67 68 69 70 71 72
extern char *UTF8_prev(const char *p);

/*
 * UTF8_next_n advances the pointer over 'n' UTF8 characters and returns the
 * new pointer. It uses UTF8_next so has its behaviour over starting conditions.
 */

extern char *UTF8_next_n(const char *p, int n_chars);
Simon Middleton's avatar
Simon Middleton committed
73 74

#endif