comparison src/libid3tag/utf8.c @ 2503:10692383c103 trunk

[svn] First attempt at libid3tag integration. This improved libid3tag supports VFS operations and can add an ID3v2 tag to files that have never had one.
author yaz
date Sun, 11 Feb 2007 05:19:07 -0800
parents
children
comparison
equal deleted inserted replaced
2502:b7be0af74307 2503:10692383c103
1 /*
2 * libid3tag - ID3 tag manipulation library
3 * Copyright (C) 2000-2004 Underbit Technologies, Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * $Id: utf8.c,v 1.9 2004/01/23 09:41:32 rob Exp $
20 */
21
22 # ifdef HAVE_CONFIG_H
23 # include "config.h"
24 # endif
25
26 # include "global.h"
27
28 # include <stdlib.h>
29
30 # include "id3tag.h"
31 # include "utf8.h"
32 # include "ucs4.h"
33
34 /*
35 * NAME: utf8->length()
36 * DESCRIPTION: return the number of ucs4 chars represented by a utf8 string
37 */
38 id3_length_t id3_utf8_length(id3_utf8_t const *utf8)
39 {
40 id3_length_t length = 0;
41
42 while (*utf8) {
43 if ((utf8[0] & 0x80) == 0x00)
44 ++length;
45 else if ((utf8[0] & 0xe0) == 0xc0 &&
46 (utf8[1] & 0xc0) == 0x80) {
47 if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) {
48 ++length;
49 utf8 += 1;
50 }
51 }
52 else if ((utf8[0] & 0xf0) == 0xe0 &&
53 (utf8[1] & 0xc0) == 0x80 &&
54 (utf8[2] & 0xc0) == 0x80) {
55 if ((((utf8[0] & 0x0fL) << 12) |
56 ((utf8[1] & 0x3fL) << 6)) >= 0x00000800L) {
57 ++length;
58 utf8 += 2;
59 }
60 }
61 else if ((utf8[0] & 0xf8) == 0xf0 &&
62 (utf8[1] & 0xc0) == 0x80 &&
63 (utf8[2] & 0xc0) == 0x80 &&
64 (utf8[3] & 0xc0) == 0x80) {
65 if ((((utf8[0] & 0x07L) << 18) |
66 ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) {
67 ++length;
68 utf8 += 3;
69 }
70 }
71 else if ((utf8[0] & 0xfc) == 0xf8 &&
72 (utf8[1] & 0xc0) == 0x80 &&
73 (utf8[2] & 0xc0) == 0x80 &&
74 (utf8[3] & 0xc0) == 0x80 &&
75 (utf8[4] & 0xc0) == 0x80) {
76 if ((((utf8[0] & 0x03L) << 24) |
77 ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) {
78 ++length;
79 utf8 += 4;
80 }
81 }
82 else if ((utf8[0] & 0xfe) == 0xfc &&
83 (utf8[1] & 0xc0) == 0x80 &&
84 (utf8[2] & 0xc0) == 0x80 &&
85 (utf8[3] & 0xc0) == 0x80 &&
86 (utf8[4] & 0xc0) == 0x80 &&
87 (utf8[5] & 0xc0) == 0x80) {
88 if ((((utf8[0] & 0x01L) << 30) |
89 ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) {
90 ++length;
91 utf8 += 5;
92 }
93 }
94
95 ++utf8;
96 }
97
98 return length;
99 }
100
101 /*
102 * NAME: utf8->size()
103 * DESCRIPTION: return the encoding size of a utf8 string
104 */
105 id3_length_t id3_utf8_size(id3_utf8_t const *utf8)
106 {
107 id3_utf8_t const *ptr = utf8;
108
109 while (*ptr)
110 ++ptr;
111
112 return ptr - utf8 + 1;
113 }
114
115 /*
116 * NAME: utf8->ucs4duplicate()
117 * DESCRIPTION: duplicate and decode a utf8 string into ucs4
118 */
119 id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8)
120 {
121 id3_ucs4_t *ucs4;
122
123 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
124 if (ucs4)
125 id3_utf8_decode(utf8, ucs4);
126
127 return release(ucs4);
128 }
129
130 /*
131 * NAME: utf8->decodechar()
132 * DESCRIPTION: decode a series of utf8 chars into a single ucs4 char
133 */
134 id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
135 {
136 id3_utf8_t const *start = utf8;
137
138 while (1) {
139 if ((utf8[0] & 0x80) == 0x00) {
140 *ucs4 = utf8[0];
141 return utf8 - start + 1;
142 }
143 else if ((utf8[0] & 0xe0) == 0xc0 &&
144 (utf8[1] & 0xc0) == 0x80) {
145 *ucs4 =
146 ((utf8[0] & 0x1fL) << 6) |
147 ((utf8[1] & 0x3fL) << 0);
148 if (*ucs4 >= 0x00000080L)
149 return utf8 - start + 2;
150 }
151 else if ((utf8[0] & 0xf0) == 0xe0 &&
152 (utf8[1] & 0xc0) == 0x80 &&
153 (utf8[2] & 0xc0) == 0x80) {
154 *ucs4 =
155 ((utf8[0] & 0x0fL) << 12) |
156 ((utf8[1] & 0x3fL) << 6) |
157 ((utf8[2] & 0x3fL) << 0);
158 if (*ucs4 >= 0x00000800L)
159 return utf8 - start + 3;
160 }
161 else if ((utf8[0] & 0xf8) == 0xf0 &&
162 (utf8[1] & 0xc0) == 0x80 &&
163 (utf8[2] & 0xc0) == 0x80 &&
164 (utf8[3] & 0xc0) == 0x80) {
165 *ucs4 =
166 ((utf8[0] & 0x07L) << 18) |
167 ((utf8[1] & 0x3fL) << 12) |
168 ((utf8[2] & 0x3fL) << 6) |
169 ((utf8[3] & 0x3fL) << 0);
170 if (*ucs4 >= 0x00010000L)
171 return utf8 - start + 4;
172 }
173 else if ((utf8[0] & 0xfc) == 0xf8 &&
174 (utf8[1] & 0xc0) == 0x80 &&
175 (utf8[2] & 0xc0) == 0x80 &&
176 (utf8[3] & 0xc0) == 0x80 &&
177 (utf8[4] & 0xc0) == 0x80) {
178 *ucs4 =
179 ((utf8[0] & 0x03L) << 24) |
180 ((utf8[1] & 0x3fL) << 18) |
181 ((utf8[2] & 0x3fL) << 12) |
182 ((utf8[3] & 0x3fL) << 6) |
183 ((utf8[4] & 0x3fL) << 0);
184 if (*ucs4 >= 0x00200000L)
185 return utf8 - start + 5;
186 }
187 else if ((utf8[0] & 0xfe) == 0xfc &&
188 (utf8[1] & 0xc0) == 0x80 &&
189 (utf8[2] & 0xc0) == 0x80 &&
190 (utf8[3] & 0xc0) == 0x80 &&
191 (utf8[4] & 0xc0) == 0x80 &&
192 (utf8[5] & 0xc0) == 0x80) {
193 *ucs4 =
194 ((utf8[0] & 0x01L) << 30) |
195 ((utf8[1] & 0x3fL) << 24) |
196 ((utf8[2] & 0x3fL) << 18) |
197 ((utf8[3] & 0x3fL) << 12) |
198 ((utf8[4] & 0x3fL) << 6) |
199 ((utf8[5] & 0x3fL) << 0);
200 if (*ucs4 >= 0x04000000L)
201 return utf8 - start + 6;
202 }
203
204 ++utf8;
205 }
206 }
207
208 /*
209 * NAME: utf8->encodechar()
210 * DESCRIPTION: encode a single ucs4 char into a series of up to 6 utf8 chars
211 */
212 id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4)
213 {
214 if (ucs4 <= 0x0000007fL) {
215 utf8[0] = ucs4;
216
217 return 1;
218 }
219 else if (ucs4 <= 0x000007ffL) {
220 utf8[0] = 0xc0 | ((ucs4 >> 6) & 0x1f);
221 utf8[1] = 0x80 | ((ucs4 >> 0) & 0x3f);
222
223 return 2;
224 }
225 else if (ucs4 <= 0x0000ffffL) {
226 utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f);
227 utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
228 utf8[2] = 0x80 | ((ucs4 >> 0) & 0x3f);
229
230 return 3;
231 }
232 else if (ucs4 <= 0x001fffffL) {
233 utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07);
234 utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
235 utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
236 utf8[3] = 0x80 | ((ucs4 >> 0) & 0x3f);
237
238 return 4;
239 }
240 else if (ucs4 <= 0x03ffffffL) {
241 utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03);
242 utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f);
243 utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f);
244 utf8[3] = 0x80 | ((ucs4 >> 6) & 0x3f);
245 utf8[4] = 0x80 | ((ucs4 >> 0) & 0x3f);
246
247 return 5;
248 }
249 else if (ucs4 <= 0x7fffffffL) {
250 utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01);
251 utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f);
252 utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f);
253 utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f);
254 utf8[4] = 0x80 | ((ucs4 >> 6) & 0x3f);
255 utf8[5] = 0x80 | ((ucs4 >> 0) & 0x3f);
256
257 return 6;
258 }
259
260 /* default */
261
262 return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR);
263 }
264
265 /*
266 * NAME: utf8->decode()
267 * DESCRIPTION: decode a complete utf8 string into a ucs4 string
268 */
269 void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
270 {
271 do
272 utf8 += id3_utf8_decodechar(utf8, ucs4);
273 while (*ucs4++);
274 }
275
276 /*
277 * NAME: utf8->encode()
278 * DESCRIPTION: encode a complete ucs4 string into a utf8 string
279 */
280 void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4)
281 {
282 do
283 utf8 += id3_utf8_encodechar(utf8, *ucs4);
284 while (*ucs4++);
285 }
286
287 /*
288 * NAME: utf8->put()
289 * DESCRIPTION: serialize a single utf8 character
290 */
291 id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8)
292 {
293 if (ptr)
294 *(*ptr)++ = utf8;
295
296 return 1;
297 }
298
299 /*
300 * NAME: utf8->get()
301 * DESCRIPTION: deserialize a single utf8 character
302 */
303 id3_utf8_t id3_utf8_get(id3_byte_t const **ptr)
304 {
305 return *(*ptr)++;
306 }
307
308 /*
309 * NAME: utf8->serialize()
310 * DESCRIPTION: serialize a ucs4 string using utf8 encoding
311 */
312 id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
313 int terminate)
314 {
315 id3_length_t size = 0;
316 id3_utf8_t utf8[6], *out;
317
318 while (*ucs4) {
319 switch (id3_utf8_encodechar(out = utf8, *ucs4++)) {
320 case 6: size += id3_utf8_put(ptr, *out++);
321 case 5: size += id3_utf8_put(ptr, *out++);
322 case 4: size += id3_utf8_put(ptr, *out++);
323 case 3: size += id3_utf8_put(ptr, *out++);
324 case 2: size += id3_utf8_put(ptr, *out++);
325 case 1: size += id3_utf8_put(ptr, *out++);
326 case 0: break;
327 }
328 }
329
330 if (terminate)
331 size += id3_utf8_put(ptr, 0);
332
333 return size;
334 }
335
336 /*
337 * NAME: utf8->deserialize()
338 * DESCRIPTION: deserialize a ucs4 string using utf8 encoding
339 */
340 id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length)
341 {
342 id3_byte_t const *end;
343 id3_utf8_t *utf8ptr, *utf8;
344 id3_ucs4_t *ucs4;
345
346 end = *ptr + length;
347
348 utf8 = malloc((length + 1) * sizeof(*utf8));
349 if (utf8 == 0)
350 return 0;
351
352 utf8ptr = utf8;
353 while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr)))
354 ++utf8ptr;
355
356 *utf8ptr = 0;
357
358 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
359 if (ucs4)
360 id3_utf8_decode(utf8, ucs4);
361
362 free(utf8);
363
364 return ucs4;
365 }