FLTK 1.3.9
Loading...
Searching...
No Matches
utf8Utils.c
1/* "$Id: $"
2 *
3 * Author: Jean-Marc Lienher ( http://oksid.ch )
4 * Copyright 2000-2003 by O'ksi'D.
5 *
6 * This library is free software. Distribution and use rights are outlined in
7 * the file "COPYING" which should have been included with this file. If this
8 * file is missing or damaged, see the license at:
9 *
10 * http://www.fltk.org/COPYING.php
11 *
12 * Please report all bugs and problems on the following page:
13 *
14 * http://www.fltk.org/str.php
15 */
16
17/*
18 * Unicode to UTF-8 conversion functions.
19 */
20
21#if !defined(WIN32) && !defined(__APPLE__)
22
23#include "../Xutf8.h"
24
25/*** NOTE : all functions are LIMITED to 24 bits Unicode values !!! ***/
26
/*
 * Decodes the first character of a UTF-8 byte string into a Unicode value.
 *
 *   buf  bytes to decode
 *   len  number of bytes available in buf
 *   ucs  receives the decoded value, or '?' when decoding fails
 *
 * Returns the number of bytes consumed (1 to 5), or -1 if the sequence is
 * not valid. A sequence is rejected when:
 *   - no bytes are available (len < 1; buf is not read in that case),
 *   - the lead byte is a stray continuation byte (0x80-0xBF) or announces
 *     a 6 byte form or worse (0xFC-0xFF),
 *   - fewer than the announced number of bytes are available,
 *   - a trailing byte is not of the form 10xxxxxx,
 *   - the value is encoded with more bytes than necessary (overlong), or
 *   - the value does not fit in 24 bits (5 byte forms >= 0x01000000).
 */
int
XConvertUtf8ToUcs(const unsigned char *buf,
                  int                  len,
                  unsigned int        *ucs) {

  unsigned int value = 0;      /* code point being assembled              */
  unsigned int smallest = 0;   /* lowest value allowed for this length    */
  int          need = 0;       /* sequence length; 0 means "invalid lead" */
  int          i;

  if (len > 0) {
    /* Only touch buf[0] once we know at least one byte is available. */
    const unsigned int lead = buf[0];

    if (lead < 0x80) {
      /* 0x00000000 - 0x0000007F */
      *ucs = lead;
      return 1;
    }

    if (lead >= 0xC0 && lead < 0xE0) {
      /* 0x00000080 - 0x000007FF */
      need = 2; value = lead & 0x1F; smallest = 0x00000080;
    } else if (lead >= 0xE0 && lead < 0xF0) {
      /* 0x00000800 - 0x0000FFFF */
      need = 3; value = lead & 0x0F; smallest = 0x00000800;
    } else if (lead >= 0xF0 && lead < 0xF8) {
      /* 0x00010000 - 0x001FFFFF */
      need = 4; value = lead & 0x07; smallest = 0x00010000;
    } else if (lead >= 0xF8 && lead < 0xFC) {
      /* 0x00200000 - 0x00FFFFFF (24 bit limit) */
      need = 5; value = lead & 0x03; smallest = 0x00200000;
    }
    /* 0x80-0xBF and 0xFC-0xFF leave need at 0: bad UTF-8 string */
  }

  if (need != 0 && len >= need) {
    for (i = 1; i < need; i++) {
      if ((buf[i] & 0xC0) != 0x80) break;   /* not a continuation byte */
      value = (value << 6) | (buf[i] & 0x3Fu);
    }
    /* i == need only when every trailing byte was well formed */
    if (i == need && value >= smallest && value < 0x01000000) {
      *ucs = value;
      return need;
    }
  }

  *ucs = (unsigned int) '?'; /* bad utf-8 string */
  return -1;
}
97
/*
 * Encodes a Unicode value as a UTF-8 byte sequence.
 *
 *   ucs  value to encode; only values below 0x01000000 are supported
 *   buf  destination, which must have room for at least 5 bytes
 *
 * Returns the number of bytes written (1 to 5). For an unsupported value
 * a single '?' is stored in buf[0] and -1 is returned. No terminating NUL
 * is written.
 */
int
XConvertUcsToUtf8(unsigned int ucs,
                  char        *buf) {

  unsigned int marker;   /* length tag carried by the first byte */
  int          total;    /* number of bytes in the encoding      */
  int          pos;

  if (ucs < 0x000080) {
    /* plain ASCII is stored unchanged */
    buf[0] = (char) ucs;
    return 1;
  }

  if (ucs >= 0x01000000) {
    /* beyond the 24 bit limit of this module */
    buf[0] = '?';
    return -1;
  }

  if (ucs < 0x000800) {
    total = 2; marker = 0xC0;
  } else if (ucs < 0x010000) {
    total = 3; marker = 0xE0;
  } else if (ucs < 0x00200000) {
    total = 4; marker = 0xF0;
  } else {
    total = 5; marker = 0xF8;
  }

  /* Emit the trailing bytes last-to-first, six payload bits at a time. */
  for (pos = total - 1; pos > 0; pos--) {
    buf[pos] = (char) (0x80 | (ucs & 0x3F));
    ucs >>= 6;
  }

  /* Whatever is left of the value belongs in the first byte. */
  buf[0] = (char) (marker | ucs);
  return total;
}
135
/*
 * Reports how many bytes the first UTF-8 character of buf occupies.
 *
 *   buf  bytes to inspect
 *   len  number of bytes available in buf
 *
 * The character is decoded with XConvertUtf8ToUcs() and the decoded value
 * is thrown away; only that function's return value is passed on, so the
 * result is -1 whenever it reports an invalid sequence.
 */
int
XUtf8CharByteLen(const unsigned char *buf,
                 int                  len) {
  unsigned int discarded;   /* decoded value, not needed by the caller */
  int          byte_count;

  byte_count = XConvertUtf8ToUcs(buf, len, &discarded);
  return byte_count;
}
146
/*
 * Counts the Unicode characters held in a UTF-8 string.
 *
 *   buf  bytes to scan
 *   len  number of bytes in buf
 *
 * Each position where XUtf8CharByteLen() does not return a positive length
 * is counted as one character and skipped as a single byte, so the scan
 * always advances. Returns 0 when len is not positive.
 */
int
XCountUtf8Char(const unsigned char *buf,
               int                  len) {

  int count = 0;
  int offset;

  for (offset = 0; offset < len; count++) {
    int step = XUtf8CharByteLen(buf + offset, len - offset);
    /* an undecodable byte still counts, and is stepped over on its own */
    offset += (step < 1) ? 1 : step;
  }
  return count;
}
164
/*
 * Same job as XConvertUtf8ToUcs() but without content checks: the lead
 * byte alone selects the sequence length, trailing bytes are not verified
 * to be continuation bytes, and neither overlong encodings nor the 24 bit
 * limit are checked.
 *
 *   buf  bytes to decode
 *   len  number of bytes available in buf
 *   ucs  receives the decoded value, or '?' on failure
 *
 * Returns the number of bytes consumed (1 to 5). Returns -1 when len < 1
 * (buf is not read in that case), when the lead byte is 0x80-0xBF or
 * 0xFC-0xFF, or when fewer bytes are available than the lead byte
 * announces.
 */
int
XFastConvertUtf8ToUcs(const unsigned char *buf,
                      int                  len,
                      unsigned int        *ucs) {

  unsigned int value = 0;   /* code point being assembled              */
  int          need = 0;    /* sequence length; 0 means "invalid lead" */
  int          i;

  if (len > 0) {
    /* Only touch buf[0] once we know at least one byte is available. */
    const unsigned int lead = buf[0];

    if (lead < 0x80) {
      /* 0x00000000 - 0x0000007F */
      *ucs = lead;
      return 1;
    }

    if (lead >= 0xC0 && lead < 0xE0) {
      /* 0x00000080 - 0x000007FF */
      need = 2; value = lead & 0x1F;
    } else if (lead >= 0xE0 && lead < 0xF0) {
      /* 0x00000800 - 0x0000FFFF */
      need = 3; value = lead & 0x0F;
    } else if (lead >= 0xF0 && lead < 0xF8) {
      /* 0x00010000 - 0x001FFFFF */
      need = 4; value = lead & 0x07;
    } else if (lead >= 0xF8 && lead < 0xFC) {
      /* 0x00200000 - 0x03FFFFFF */
      need = 5; value = lead & 0x03;
    }
    /* 0x80-0xBF and 0xFC-0xFF leave need at 0: bad UTF-8 string */
  }

  if (need != 0 && len >= need) {
    for (i = 1; i < need; i++) {
      /*
       * Trailing bytes are taken on trust: only the top bit is dropped and
       * the result is added (not OR-ed), so malformed input yields exactly
       * the same value as it always has.
       */
      value = (value << 6) + (buf[i] & 0x7Fu);
    }
    *ucs = value;
    return need;
  }

  *ucs = (unsigned int) '?'; /* bad utf-8 string */
  return -1;
}
224
225#endif /* X11 only */
226
227/*
228 * End of "$Id: $".
229 */