// jcharset.h
/***************************************************************************
* Copyright (C) 2005 by <still unknown> <modified version> *
* root@sat *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
#ifndef J_CHARSET_H
#define J_CHARSET_H
#include "jobject.h"
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
// Fixed-width code-unit types shared by the conversion routines.
typedef uint32_t UTF32; // one UTF-32 code unit (exactly 32 bits)
typedef uint16_t UTF16; // one UTF-16 code unit (exactly 16 bits)
typedef uint8_t UTF8; // one UTF-8 code unit (exactly 8 bits)
typedef uint32_t Char; // one Unicode character value held in 32 bits

// Some fundamental constants.
// Every replacement list is wrapped in parentheses so the cast cannot bind
// unexpectedly where the macro is expanded (e.g. `sizeof UNI_MAX_BMP`).
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD) // U+FFFD, the Unicode replacement character
#define UNI_MAX_BMP ((UTF32)0x0000FFFF) // last value of the Basic Multilingual Plane
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF) // largest value named as reachable through UTF-16
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF) // largest value named as a UTF-32 maximum
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF) // largest value named as a legal UTF-32 character
// Status codes returned by the Charset conversion routines.
enum jcharset_result_t {
	JCR_OK = 0,               // the conversion completed successfully
	JCR_SOURCE_EXHAUSTED = 1, // the source ended in the middle of a character
	JCR_TARGET_EXHAUSTED = 2, // the target had too little room for the result
	JCR_SOURCE_ILLEGAL = 3,   // the source sequence is illegal or malformed
	JCR_SOURCE_CORRUPT = 4    // the source was corrupted
};
// Conversion mode passed as the `flags` argument of the Convert* routines.
// NOTE(review): presumably selects whether questionable input is rejected
// (strict) or tolerated (lenient) - confirm against the implementation.
enum jcharset_flags_t {
	JCF_STRICT_CONVERSION = 0,
	JCF_LENIENT_CONVERSION = 1
};
// Bit patterns used when handling UTF-8 bytes.
enum {
	Low6Bits     = 0x3F, // binary 00111111: the six low bits of a byte
	High2Bits    = 0xC0, // binary 11000000: the two high bits of a byte
	ByteMask     = 0xBF, // binary 10111111
	ContinueBits = 0x80  // binary 10xxxxxx: marker carried by a continuation byte
};
namespace jcommon {

/**
 * \brief Conversions between Unicode values, UTF-8/UTF-16/UTF-32 and ISO-Latin-1.
 *
 * UTF-8 is a way of reading and writing Unicode 32-bit characters
 * to ordinary 8-bit communications streams.
 *
 * The UTF-8 algorithm stores characters into variable-sized
 * chunks. Characters in the range 0x00 to 0x7F fit into one
 * byte, since these will be quite common (ASCII values).
 * Characters with higher values fit into two, three, four,
 * five, or six bytes, depending on the number of significant
 * bits, according to the following pattern:
 *
 *   Bits  Pattern
 *   ----  -------
 *     7   0xxxxxxx
 *    11   110xxxxx 10xxxxxx
 *    16   1110xxxx 10xxxxxx 10xxxxxx
 *    21   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *    26   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *    32   111111xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * As can be seen from the table, at most 32 bits can be stored
 * using this algorithm (the x's mark where the actual bits go,
 * the numbers signify the padding bits). The padding "10" at
 * the start of a byte uniquely identifies a continuation byte,
 * which is never used as the start of a UTF-8 character sequence,
 * so if a stream is broken for some reason, the algorithm can
 * skip those bytes to find the next start of a character.
 *
 * ASCII is a 7-bit encoding for the English language alphabet
 * and various digits and symbols. Its values range from 0x00 to 0x7F.
 *
 * A superset of ASCII is ISO-Latin-1 (code page 8859-1). This is
 * an 8-bit encoding for Western European languages, with values
 * in the range 0x00 to 0xFF. The lower half of this range is
 * the same as ASCII, while the upper half includes many accented
 * characters.
 *
 * Unicode is a superset of ISO-Latin-1, which mostly fits into
 * 16 bits, but which is actually a 32-bit encoding for most
 * language symbols on Earth, including Eastern European, African,
 * Asian, and many other languages. It allows a single document
 * to contain mixtures of all languages.
 *
 * This class declares functions for reading and writing Unicode
 * and ISO-Latin-1 streams, to and from an array of 32-bit
 * Unicode values in memory. Each 32-bit value is called a Char.
 */
class Charset : public virtual jcommon::Object {

	public:
		/**
		 * \brief Constructs a Charset object.
		 *
		 */
		Charset();

		/**
		 * \brief Destroys the Charset object.
		 *
		 */
		virtual ~Charset();

		/**
		 * \brief Range conversion from Unicode values (Char) to UTF-8 bytes.
		 *
		 * NOTE(review): presumably reads [*src_start, src_end), writes into
		 * [*dst_start, dst_end) and advances both start pointers - confirm
		 * against the implementation.
		 */
		jcharset_result_t UnicodeToUTF8(
				const Char **src_start,
				const Char *src_end,
				char **dst_start,
				const char *dst_end);

		/**
		 * \brief Range conversion from UTF-8 bytes to Unicode values (Char).
		 *
		 * NOTE(review): presumably the mirror image of the range form of
		 * UnicodeToUTF8 - confirm against the implementation.
		 */
		jcharset_result_t UTF8ToUnicode(
				const char **src_start,
				const char *src_end,
				Char **dst_start,
				const Char *dst_end);

		/**
		 * \brief Convert a single Unicode character into a UTF-8 byte buffer.
		 *
		 * The buffer should be at least 7 bytes wide.
		 */
		int UnicodeToUTF8(Char ch, char *utf8);

		/**
		 * \brief Convert a single UTF-8 character into a Unicode value.
		 *
		 */
		Char UTF8ToUnicode(const char *utf8);

		/**
		 * \brief Return the number of Unicode characters within a UTF-8 string.
		 *
		 * For ASCII strings this will return the same number as strlen.
		 */
		int UTF8Length(const char *utf8);

		/**
		 * \brief Read a Unicode value from a UTF-8 file.
		 *
		 * The file must be open in binary mode to read. Errors are reported
		 * in the return value, as a charset_result_t (presumably the
		 * jcharset_result_t enumeration - confirm).
		 */
		int ReadUTF8(FILE *f, Char *dst);

		/**
		 * \brief Read a file into a UTF-8 char array, up to and including
		 * the 'stop' character (or an EOF will end input).
		 *
		 * This function returns the alloc'd UTF-8 encoded string.
		 * The number of bytes in the returned string is placed in *nbytes.
		 * The number of characters in the returned string is placed in *nchars.
		 * If EOF is encountered immediately, the function returns NULL.
		 * If the 'stop' character is EOF, this function reads the
		 * entire file.
		 */
		char * ReadLatin1File(FILE *f, int *nbytes, int *nchars, int stop);

		/**
		 * \brief UTF-8 counterpart of ReadLatin1File.
		 *
		 * NOTE(review): presumably follows the same contract as
		 * ReadLatin1File for a UTF-8 encoded input file - confirm.
		 */
		char * ReadUTF8File(FILE *f, int *nbytes, int *nchars, int stop);

		/**
		 * \brief Read an entire file into a memory char array.
		 *
		 * Return NULL if the file is empty.
		 */
		char * ReadLatin1Buffer(FILE *f, int *nbytes, int *nchars);

		/**
		 * \brief UTF-8 counterpart of ReadLatin1Buffer.
		 *
		 * NOTE(review): presumably follows the same contract as
		 * ReadLatin1Buffer for a UTF-8 encoded input file - confirm.
		 */
		char * ReadUTF8Buffer(FILE *f, int *nbytes, int *nchars);

		/**
		 * \brief A function for reading one line of input from a file.
		 *
		 * This function returns the alloc'd string, and any
		 * terminating newline character is included in the line.
		 * The length of the returned string is placed into *length
		 * (the parameters declared here are nbytes and nchars; presumably
		 * they receive the byte and character counts - confirm).
		 * If EOF is encountered immediately, the function returns NULL.
		 * If EOF is encountered before a newline character, the string
		 * is returned without any terminating newline.
		 * Otherwise, a newline character will be the last character
		 * in the char array.
		 */
		char * ReadLatin1Line(FILE *f, int *nbytes, int *nchars);

		/**
		 * \brief UTF-8 counterpart of ReadLatin1Line.
		 *
		 * NOTE(review): presumably follows the same contract as
		 * ReadLatin1Line for a UTF-8 encoded input file - confirm.
		 */
		char * ReadUTF8Line(FILE *f, int *nbytes, int *nchars);

		/**
		 * \brief Write a UTF-8 char array to a file as ISO Latin 1.
		 *
		 * Non ISO-Latin-1 characters will be distorted by this process.
		 * Assume the UTF-8 char array is correct.
		 */
		int WriteLatin1(FILE *f, const char *utf8, int nbytes);

		/**
		 * \brief Write a UTF-8 char array to a file as UTF-8.
		 *
		 * Assume the UTF-8 char array is correct.
		 */
		int WriteUTF8(FILE *f, const char *utf8, int nbytes);

		/**
		 * \brief Convert a UTF-8 char array to an ISO Latin 1 char array.
		 *
		 * Non ISO Latin 1 characters will be distorted by this process.
		 * Assume the UTF-8 char array is correct. This function creates a
		 * new string containing the ISO Latin 1 data. It returns NULL if it
		 * runs out of memory.
		 */
		char * UTF8ToLatin1(const char *utf8, int *bytes);

		/**
		 * \brief Convert a (possibly ISO Latin 1) char array to a UTF-8 char
		 * array, as best we can.
		 *
		 * If it is already correctly UTF 8 encoded, return the input string
		 * unchanged. This function may create a new string containing the
		 * UTF-8 data. It returns NULL if it runs out of memory.
		 */
		char * CorrectUTF8(const char *s, int *bytes);

		/**
		 * \brief Return non-zero (true) if the given UTF-8 char array contains
		 * only ASCII characters, otherwise return zero.
		 *
		 */
		int IsASCII(const char *utf8, int nbytes);

		/**
		 * \brief Return non-zero (true) if the given UTF-8 char array contains
		 * only ASCII and ISO Latin-1 characters, otherwise return zero.
		 *
		 */
		int IsLatin1(const char *utf8, int nbytes);

		/**
		 * \brief Buffer conversion from UTF-32 to UTF-16.
		 *
		 * NOTE(review): presumably consumes [*sourceStart, sourceEnd) and
		 * fills [*targetStart, targetEnd), advancing both start pointers -
		 * confirm against the implementation.
		 */
		jcharset_result_t ConvertUTF32ToUTF16(
				const UTF32** sourceStart,
				const UTF32* sourceEnd,
				UTF16** targetStart,
				UTF16* targetEnd,
				jcharset_flags_t flags);

		/**
		 * \brief Buffer conversion from UTF-16 to UTF-32.
		 *
		 * NOTE(review): presumably the same pointer conventions as
		 * ConvertUTF32ToUTF16 - confirm against the implementation.
		 */
		jcharset_result_t ConvertUTF16ToUTF32(
				const UTF16** sourceStart,
				const UTF16* sourceEnd,
				UTF32** targetStart,
				UTF32* targetEnd,
				jcharset_flags_t flags);

		/**
		 * \brief Buffer conversion from UTF-16 to UTF-8.
		 *
		 * The interface converts a whole buffer to avoid function-call
		 * overhead. Constants have been gathered. Loops & conditionals have
		 * been removed as much as possible for efficiency, in favor of
		 * drop-through switches. (See "Note A" at the bottom of the file for
		 * equivalent code; that note does not appear in this header.)
		 * If your compiler supports it, the "isLegalUTF8" call can be turned
		 * into an inline function.
		 */
		jcharset_result_t ConvertUTF16ToUTF8(
				const UTF16 **sourceStart,
				const UTF16 *sourceEnd,
				UTF8 **targetStart,
				UTF8 *targetEnd,
				jcharset_flags_t flags);

		/**
		 * \brief Utility routine to tell whether a sequence of bytes is legal UTF-8.
		 *
		 * This must be called with the length pre-determined by the first byte.
		 * If not calling this from ConvertUTF8to*, then the length can be set by:
		 *   length = trailingBytesForUTF8[*source]+1;
		 * and the sequence is illegal right away if there aren't that many
		 * bytes available. If presented with a length > 4, this returns false.
		 * The Unicode definition of UTF-8 goes up to 4-byte sequences.
		 */
		bool IsLegalUTF8(const UTF8 *source, int length);

		/**
		 * \brief Exported function to return whether a UTF-8 sequence is legal or not.
		 *
		 * This is not used here; it's just exported.
		 */
		bool IsLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);

		/**
		 * \brief Buffer conversion from UTF-8 to UTF-16.
		 *
		 * NOTE(review): presumably the same pointer conventions as
		 * ConvertUTF16ToUTF8 - confirm against the implementation.
		 */
		jcharset_result_t ConvertUTF8ToUTF16(
				const UTF8** sourceStart,
				const UTF8* sourceEnd,
				UTF16** targetStart,
				UTF16* targetEnd,
				jcharset_flags_t flags);

		/**
		 * \brief Buffer conversion from UTF-32 to UTF-8.
		 *
		 * NOTE(review): presumably the same pointer conventions as
		 * ConvertUTF16ToUTF8 - confirm against the implementation.
		 */
		jcharset_result_t ConvertUTF32ToUTF8(
				const UTF32** sourceStart,
				const UTF32* sourceEnd,
				UTF8** targetStart,
				UTF8* targetEnd,
				jcharset_flags_t flags);

		/**
		 * \brief Buffer conversion from UTF-8 to UTF-32.
		 *
		 * NOTE(review): presumably the same pointer conventions as
		 * ConvertUTF16ToUTF8 - confirm against the implementation.
		 */
		jcharset_result_t ConvertUTF8ToUTF32(
				const UTF8 **sourceStart,
				const UTF8 *sourceEnd,
				UTF32 **targetStart,
				UTF32 *targetEnd,
				jcharset_flags_t flags);

};

}
#endif