FLTK 1.4.0
Loading...
Searching...
No Matches
utf8Utils.c
1/*
2 * Author: Jean-Marc Lienher ( http://oksid.ch )
3 * Copyright 2000-2003 by O'ksi'D.
4 *
5 * This library is free software. Distribution and use rights are outlined in
6 * the file "COPYING" which should have been included with this file. If this
7 * file is missing or damaged, see the license at:
8 *
9 * https://www.fltk.org/COPYING.php
10 *
11 * Please see the following page on how to report bugs and issues:
12 *
13 * https://www.fltk.org/bugs.php
14 */
15
16/*
17 * Unicode to UTF-8 conversion functions.
18 *
19 * This file is compiled and linked only for X11 w/o Xft.
20 */
21
22#include "../Xutf8.h"
23
/*** NOTE: all functions are LIMITED to 24-bit Unicode values! ***/
25
/*
 * Decodes the first character of a UTF-8 byte sequence into a Unicode value.
 *
 *   buf : bytes to decode; not read at all when len is less than 1
 *   len : number of bytes available at buf
 *   ucs : receives the decoded value, or '?' when decoding fails
 *
 * Returns the byte length of the decoded character (1 to 5), or -1 when the
 * input is empty, truncated, malformed, overlong, or encodes a value of
 * 0x01000000 or above (decoded values are limited to 24 bits).
 */
int
XConvertUtf8ToUcs(const unsigned char *buf,
                  int                  len,
                  unsigned int        *ucs) {
  unsigned int lead;
  unsigned int value;

  /* Test the length before looking at buf[0]: an empty input has no first
     byte, so reading it would run past the end of the caller's data. */
  if (len < 1) {
    *ucs = (unsigned int) '?';
    return -1;
  }

  lead = buf[0];

  if ((lead & 0x80) == 0) {
    /* 0xxxxxxx : 0x00000000 - 0x0000007F */
    *ucs = lead;
    return 1;
  }

  if ((lead & 0xE0) == 0xC0) {
    /* 110xxxxx : 0x00000080 - 0x000007FF */
    if (len > 1
        && (buf[1] & 0xC0) == 0x80) {
      value = ((lead & 0x1F) << 6) |
              (unsigned int) (buf[1] & 0x3F);
      /* anything at or below 0x7F fits in one byte: overlong, reject */
      if (value > 0x0000007F) {
        *ucs = value;
        return 2;
      }
    }
  } else if ((lead & 0xF0) == 0xE0) {
    /* 1110xxxx : 0x00000800 - 0x0000FFFF */
    if (len > 2
        && (buf[1] & 0xC0) == 0x80
        && (buf[2] & 0xC0) == 0x80) {
      value = ((lead & 0x0F) << 12) |
              ((unsigned int) (buf[1] & 0x3F) << 6) |
              (unsigned int) (buf[2] & 0x3F);
      if (value > 0x000007FF) {
        *ucs = value;
        return 3;
      }
    }
  } else if ((lead & 0xF8) == 0xF0) {
    /* 11110xxx : 0x00010000 - 0x001FFFFF */
    if (len > 3
        && (buf[1] & 0xC0) == 0x80
        && (buf[2] & 0xC0) == 0x80
        && (buf[3] & 0xC0) == 0x80) {
      value = ((lead & 0x07) << 18) |
              ((unsigned int) (buf[1] & 0x3F) << 12) |
              ((unsigned int) (buf[2] & 0x3F) << 6) |
              (unsigned int) (buf[3] & 0x3F);
      if (value > 0x0000FFFF) {
        *ucs = value;
        return 4;
      }
    }
  } else if ((lead & 0xFC) == 0xF8) {
    /* 111110xx : the 5-byte form could carry up to 0x03FFFFFF, but only
       0x00200000 - 0x00FFFFFF is accepted because of the 24-bit limit */
    if (len > 4
        && (buf[1] & 0xC0) == 0x80
        && (buf[2] & 0xC0) == 0x80
        && (buf[3] & 0xC0) == 0x80
        && (buf[4] & 0xC0) == 0x80) {
      value = ((lead & 0x03) << 24) |
              ((unsigned int) (buf[1] & 0x3F) << 18) |
              ((unsigned int) (buf[2] & 0x3F) << 12) |
              ((unsigned int) (buf[3] & 0x3F) << 6) |
              (unsigned int) (buf[4] & 0x3F);
      if (value > 0x001FFFFF && value < 0x01000000) {
        *ucs = value;
        return 5;
      }
    }
  }

  /* Reached for a stray continuation byte (10xxxxxx), a lead byte of the
     form 111111xx, and every failed check above. */
  *ucs = (unsigned int) '?'; /* bad UTF-8 string */
  return -1;
}
96
/*
 * Encodes a Unicode value as a UTF-8 byte sequence.
 *
 *   ucs : value to encode; only values below 0x01000000 are accepted
 *   buf : output buffer, which must hold at least 5 bytes; no terminating
 *         NUL is written
 *
 * Returns the number of bytes stored in buf (1 to 5). For a value of
 * 0x01000000 or above, stores '?' in buf[0] and returns -1.
 */
int
XConvertUcsToUtf8(unsigned int  ucs,
                  char         *buf) {
  unsigned int marker; /* length-marker bits of the lead byte */
  int          nbytes;
  int          pos;

  if (ucs < 0x000080) {
    /* plain 7-bit value: stored as is */
    buf[0] = ucs;
    return 1;
  }

  if (ucs < 0x000800) {
    nbytes = 2;
    marker = 0xC0;
  } else if (ucs < 0x010000) {
    nbytes = 3;
    marker = 0xE0;
  } else if (ucs < 0x00200000) {
    nbytes = 4;
    marker = 0xF0;
  } else if (ucs < 0x01000000) {
    nbytes = 5;
    marker = 0xF8;
  } else {
    buf[0] = '?';
    return -1;
  }

  /* Fill the continuation bytes from the tail, peeling off six bits at a
     time; what is left of ucs afterwards belongs in the lead byte. */
  for (pos = nbytes - 1; pos > 0; pos--) {
    buf[pos] = 0x80 | (ucs & 0x3F);
    ucs >>= 6;
  }
  buf[0] = marker | ucs;

  return nbytes;
}
134
/*
 * Reports the byte length of the first UTF-8 character in buf.
 *
 * The work is delegated to XConvertUtf8ToUcs() with the same buf and len;
 * its return value is passed back unchanged (so -1 when that function
 * rejects the input) and the decoded value is thrown away.
 */
int
XUtf8CharByteLen(const unsigned char *buf,
                 int                  len) {
  unsigned int discarded; /* decoded value is not needed, only the length */
  int          nbytes = XConvertUtf8ToUcs(buf, len, &discarded);

  return nbytes;
}
145
/*
 * Counts the Unicode characters in the first len bytes of a UTF-8 string.
 *
 * The string is walked one character at a time using XUtf8CharByteLen().
 * Whenever that function reports a length below 1, a single byte is skipped
 * and still counted as one character. Returns 0 when len is not positive.
 */
int
XCountUtf8Char(const unsigned char *buf,
               int                  len) {
  int count = 0;
  int pos;

  for (pos = 0; pos < len; count++) {
    int step = XUtf8CharByteLen(buf + pos, len - pos);

    /* always move forward by at least one byte so the walk terminates */
    pos += (step < 1) ? 1 : step;
  }

  return count;
}
163
/*
 * Same job as XConvertUtf8ToUcs, but without the sanity checks: trailing
 * bytes are not verified to be continuation bytes, and overlong or
 * out-of-range results are not rejected. Only the lead byte pattern and the
 * available length are examined.
 *
 *   buf : bytes to decode; not read at all when len is less than 1
 *   len : number of bytes available at buf
 *   ucs : receives the decoded value, or '?' when decoding fails
 *
 * Returns the byte length implied by the lead byte (1 to 5), or -1 when the
 * input is empty, shorter than the lead byte requires, or starts with a byte
 * of the form 10xxxxxx or 111111xx.
 */
int
XFastConvertUtf8ToUcs(const unsigned char *buf,
                      int                  len,
                      unsigned int        *ucs) {
  unsigned int lead;

  /* Test the length before looking at buf[0]: an empty input has no first
     byte, so reading it would run past the end of the caller's data. */
  if (len < 1) {
    *ucs = (unsigned int) '?';
    return -1;
  }

  lead = buf[0];

  if ((lead & 0x80) == 0) {
    /* 0xxxxxxx : 0x00000000 - 0x0000007F */
    *ucs = lead;
    return 1;
  }

  /* Trailing bytes are unverified, so only their top bit is cleared (mask
     0x7F) and the parts are added rather than OR-ed; this keeps the result
     for malformed input exactly what this function has always produced. */
  if ((lead & 0xE0) == 0xC0) {
    /* 110xxxxx : 0x00000080 - 0x000007FF */
    if (len > 1) {
      *ucs = ((lead & 0x1F) << 6) +
             (unsigned int) (buf[1] & 0x7F);
      return 2;
    }
  } else if ((lead & 0xF0) == 0xE0) {
    /* 1110xxxx : 0x00000800 - 0x0000FFFF */
    if (len > 2) {
      *ucs = ((lead & 0x0F) << 12) +
             ((unsigned int) (buf[1] & 0x7F) << 6) +
             (unsigned int) (buf[2] & 0x7F);
      return 3;
    }
  } else if ((lead & 0xF8) == 0xF0) {
    /* 11110xxx : 0x00010000 - 0x001FFFFF */
    if (len > 3) {
      *ucs = ((lead & 0x07) << 18) +
             ((unsigned int) (buf[1] & 0x7F) << 12) +
             ((unsigned int) (buf[2] & 0x7F) << 6) +
             (unsigned int) (buf[3] & 0x7F);
      return 4;
    }
  } else if ((lead & 0xFC) == 0xF8) {
    /* 111110xx : 0x00200000 - 0x03FFFFFF */
    if (len > 4) {
      *ucs = ((lead & 0x03) << 24) +
             ((unsigned int) (buf[1] & 0x7F) << 18) +
             ((unsigned int) (buf[2] & 0x7F) << 12) +
             ((unsigned int) (buf[3] & 0x7F) << 6) +
             (unsigned int) (buf[4] & 0x7F);
      return 5;
    }
  }

  /* Reached for a stray continuation byte (10xxxxxx), a lead byte of the
     form 111111xx, and input shorter than the lead byte requires. */
  *ucs = (unsigned int) '?'; /* bad UTF-8 string */
  return -1;
}