Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * unicode.c | |
3 | * | |
4 | * PURPOSE | |
5 | * Routines for converting between UTF-8 and OSTA Compressed Unicode. | |
6 | * Also handles filename mangling | |
7 | * | |
8 | * DESCRIPTION | |
9 | * OSTA Compressed Unicode is explained in the OSTA UDF specification. | |
10 | * http://www.osta.org/ | |
11 | * UTF-8 is explained in IETF RFC 3629. | |
12 | * https://www.rfc-editor.org/rfc/rfc3629.txt | |
13 | * | |
1da177e4 LT |
14 | * COPYRIGHT |
15 | * This file is distributed under the terms of the GNU General Public | |
16 | * License (GPL). Copies of the GPL can be obtained from: | |
17 | * ftp://prep.ai.mit.edu/pub/gnu/GPL | |
18 | * Each contributing author retains all rights to their own work. | |
19 | */ | |
20 | ||
21 | #include "udfdecl.h" | |
22 | ||
23 | #include <linux/kernel.h> | |
24 | #include <linux/string.h> /* for memset */ | |
25 | #include <linux/nls.h> | |
26 | #include <linux/udf_fs.h> | |
27 | ||
28 | #include "udf_sb.h" | |
29 | ||
/* Forward declaration; the definition is at the end of this file. */
static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int udfLen,
				  uint8_t *fidName, int fidNameLen);
31 | ||
32 | static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) | |
33 | { | |
34 | if ( (!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN-2) ) | |
35 | return 0; | |
36 | memset(dest, 0, sizeof(struct ustr)); | |
37 | memcpy(dest->u_name, src, strlen); | |
38 | dest->u_cmpID = 0x08; | |
39 | dest->u_len = strlen; | |
40 | return strlen; | |
41 | } | |
42 | ||
43 | /* | |
44 | * udf_build_ustr | |
45 | */ | |
46 | int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) | |
47 | { | |
48 | int usesize; | |
49 | ||
50 | if ( (!dest) || (!ptr) || (!size) ) | |
51 | return -1; | |
52 | ||
53 | memset(dest, 0, sizeof(struct ustr)); | |
54 | usesize= (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size; | |
55 | dest->u_cmpID=ptr[0]; | |
56 | dest->u_len=ptr[size-1]; | |
57 | memcpy(dest->u_name, ptr+1, usesize-1); | |
58 | return 0; | |
59 | } | |
60 | ||
61 | /* | |
62 | * udf_build_ustr_exact | |
63 | */ | |
64 | static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) | |
65 | { | |
66 | if ( (!dest) || (!ptr) || (!exactsize) ) | |
67 | return -1; | |
68 | ||
69 | memset(dest, 0, sizeof(struct ustr)); | |
70 | dest->u_cmpID=ptr[0]; | |
71 | dest->u_len=exactsize-1; | |
72 | memcpy(dest->u_name, ptr+1, exactsize-1); | |
73 | return 0; | |
74 | } | |
75 | ||
76 | /* | |
77 | * udf_ocu_to_utf8 | |
78 | * | |
79 | * PURPOSE | |
80 | * Convert OSTA Compressed Unicode to the UTF-8 equivalent. | |
81 | * | |
82 | * DESCRIPTION | |
83 | * This routine is only called by udf_filldir(). | |
84 | * | |
85 | * PRE-CONDITIONS | |
86 | * utf Pointer to UTF-8 output buffer. | |
87 | * ocu Pointer to OSTA Compressed Unicode input buffer | |
88 | * of size UDF_NAME_LEN bytes. | |
89 | * both of type "struct ustr *" | |
90 | * | |
91 | * POST-CONDITIONS | |
92 | * <return> Zero on success. | |
93 | * | |
94 | * HISTORY | |
95 | * November 12, 1997 - Andrew E. Mileski | |
96 | * Written, tested, and released. | |
97 | */ | |
98 | int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i) | |
99 | { | |
100 | uint8_t *ocu; | |
101 | uint32_t c; | |
102 | uint8_t cmp_id, ocu_len; | |
103 | int i; | |
104 | ||
105 | ocu = ocu_i->u_name; | |
106 | ||
107 | ocu_len = ocu_i->u_len; | |
108 | cmp_id = ocu_i->u_cmpID; | |
109 | utf_o->u_len = 0; | |
110 | ||
111 | if (ocu_len == 0) | |
112 | { | |
113 | memset(utf_o, 0, sizeof(struct ustr)); | |
114 | utf_o->u_cmpID = 0; | |
115 | utf_o->u_len = 0; | |
116 | return 0; | |
117 | } | |
118 | ||
119 | if ((cmp_id != 8) && (cmp_id != 16)) | |
120 | { | |
121 | printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); | |
122 | return 0; | |
123 | } | |
124 | ||
125 | for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;) | |
126 | { | |
127 | ||
128 | /* Expand OSTA compressed Unicode to Unicode */ | |
129 | c = ocu[i++]; | |
130 | if (cmp_id == 16) | |
131 | c = (c << 8) | ocu[i++]; | |
132 | ||
133 | /* Compress Unicode to UTF-8 */ | |
134 | if (c < 0x80U) | |
135 | utf_o->u_name[utf_o->u_len++] = (uint8_t)c; | |
136 | else if (c < 0x800U) | |
137 | { | |
138 | utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6)); | |
139 | utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); | |
140 | } | |
141 | else | |
142 | { | |
143 | utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12)); | |
144 | utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | ((c >> 6) & 0x3f)); | |
145 | utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); | |
146 | } | |
147 | } | |
148 | utf_o->u_cmpID=8; | |
149 | ||
150 | return utf_o->u_len; | |
151 | } | |
152 | ||
153 | /* | |
154 | * | |
155 | * udf_utf8_to_ocu | |
156 | * | |
157 | * PURPOSE | |
158 | * Convert UTF-8 to the OSTA Compressed Unicode equivalent. | |
159 | * | |
160 | * DESCRIPTION | |
161 | * This routine is only called by udf_lookup(). | |
162 | * | |
163 | * PRE-CONDITIONS | |
164 | * ocu Pointer to OSTA Compressed Unicode output | |
165 | * buffer of size UDF_NAME_LEN bytes. | |
166 | * utf Pointer to UTF-8 input buffer. | |
167 | * utf_len Length of UTF-8 input buffer in bytes. | |
168 | * | |
169 | * POST-CONDITIONS | |
170 | * <return> Zero on success. | |
171 | * | |
172 | * HISTORY | |
173 | * November 12, 1997 - Andrew E. Mileski | |
174 | * Written, tested, and released. | |
175 | */ | |
176 | static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) | |
177 | { | |
178 | unsigned c, i, max_val, utf_char; | |
179 | int utf_cnt, u_len; | |
180 | ||
181 | memset(ocu, 0, sizeof(dstring) * length); | |
182 | ocu[0] = 8; | |
183 | max_val = 0xffU; | |
184 | ||
185 | try_again: | |
186 | u_len = 0U; | |
187 | utf_char = 0U; | |
188 | utf_cnt = 0U; | |
189 | for (i = 0U; i < utf->u_len; i++) | |
190 | { | |
191 | c = (uint8_t)utf->u_name[i]; | |
192 | ||
193 | /* Complete a multi-byte UTF-8 character */ | |
194 | if (utf_cnt) | |
195 | { | |
196 | utf_char = (utf_char << 6) | (c & 0x3fU); | |
197 | if (--utf_cnt) | |
198 | continue; | |
199 | } | |
200 | else | |
201 | { | |
202 | /* Check for a multi-byte UTF-8 character */ | |
203 | if (c & 0x80U) | |
204 | { | |
205 | /* Start a multi-byte UTF-8 character */ | |
206 | if ((c & 0xe0U) == 0xc0U) | |
207 | { | |
208 | utf_char = c & 0x1fU; | |
209 | utf_cnt = 1; | |
210 | } | |
211 | else if ((c & 0xf0U) == 0xe0U) | |
212 | { | |
213 | utf_char = c & 0x0fU; | |
214 | utf_cnt = 2; | |
215 | } | |
216 | else if ((c & 0xf8U) == 0xf0U) | |
217 | { | |
218 | utf_char = c & 0x07U; | |
219 | utf_cnt = 3; | |
220 | } | |
221 | else if ((c & 0xfcU) == 0xf8U) | |
222 | { | |
223 | utf_char = c & 0x03U; | |
224 | utf_cnt = 4; | |
225 | } | |
226 | else if ((c & 0xfeU) == 0xfcU) | |
227 | { | |
228 | utf_char = c & 0x01U; | |
229 | utf_cnt = 5; | |
230 | } | |
231 | else | |
232 | goto error_out; | |
233 | continue; | |
234 | } else | |
235 | /* Single byte UTF-8 character (most common) */ | |
236 | utf_char = c; | |
237 | } | |
238 | ||
239 | /* Choose no compression if necessary */ | |
240 | if (utf_char > max_val) | |
241 | { | |
242 | if ( 0xffU == max_val ) | |
243 | { | |
244 | max_val = 0xffffU; | |
245 | ocu[0] = (uint8_t)0x10U; | |
246 | goto try_again; | |
247 | } | |
248 | goto error_out; | |
249 | } | |
250 | ||
251 | if (max_val == 0xffffU) | |
252 | { | |
253 | ocu[++u_len] = (uint8_t)(utf_char >> 8); | |
254 | } | |
255 | ocu[++u_len] = (uint8_t)(utf_char & 0xffU); | |
256 | } | |
257 | ||
258 | ||
259 | if (utf_cnt) | |
260 | { | |
261 | error_out: | |
262 | ocu[++u_len] = '?'; | |
263 | printk(KERN_DEBUG "udf: bad UTF-8 character\n"); | |
264 | } | |
265 | ||
266 | ocu[length - 1] = (uint8_t)u_len + 1; | |
267 | return u_len + 1; | |
268 | } | |
269 | ||
270 | static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i) | |
271 | { | |
272 | uint8_t *ocu; | |
273 | uint32_t c; | |
274 | uint8_t cmp_id, ocu_len; | |
275 | int i; | |
276 | ||
277 | ocu = ocu_i->u_name; | |
278 | ||
279 | ocu_len = ocu_i->u_len; | |
280 | cmp_id = ocu_i->u_cmpID; | |
281 | utf_o->u_len = 0; | |
282 | ||
283 | if (ocu_len == 0) | |
284 | { | |
285 | memset(utf_o, 0, sizeof(struct ustr)); | |
286 | utf_o->u_cmpID = 0; | |
287 | utf_o->u_len = 0; | |
288 | return 0; | |
289 | } | |
290 | ||
291 | if ((cmp_id != 8) && (cmp_id != 16)) | |
292 | { | |
293 | printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); | |
294 | return 0; | |
295 | } | |
296 | ||
297 | for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;) | |
298 | { | |
299 | /* Expand OSTA compressed Unicode to Unicode */ | |
300 | c = ocu[i++]; | |
301 | if (cmp_id == 16) | |
302 | c = (c << 8) | ocu[i++]; | |
303 | ||
304 | utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], | |
305 | UDF_NAME_LEN - utf_o->u_len); | |
306 | } | |
307 | utf_o->u_cmpID=8; | |
308 | ||
309 | return utf_o->u_len; | |
310 | } | |
311 | ||
312 | static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length) | |
313 | { | |
314 | unsigned len, i, max_val; | |
315 | uint16_t uni_char; | |
316 | int u_len; | |
317 | ||
318 | memset(ocu, 0, sizeof(dstring) * length); | |
319 | ocu[0] = 8; | |
320 | max_val = 0xffU; | |
321 | ||
322 | try_again: | |
323 | u_len = 0U; | |
324 | for (i = 0U; i < uni->u_len; i++) | |
325 | { | |
326 | len = nls->char2uni(&uni->u_name[i], uni->u_len-i, &uni_char); | |
327 | if (len <= 0) | |
328 | continue; | |
329 | ||
330 | if (uni_char > max_val) | |
331 | { | |
332 | max_val = 0xffffU; | |
333 | ocu[0] = (uint8_t)0x10U; | |
334 | goto try_again; | |
335 | } | |
336 | ||
337 | if (max_val == 0xffffU) | |
338 | ocu[++u_len] = (uint8_t)(uni_char >> 8); | |
339 | ocu[++u_len] = (uint8_t)(uni_char & 0xffU); | |
340 | i += len - 1; | |
341 | } | |
342 | ||
343 | ocu[length - 1] = (uint8_t)u_len + 1; | |
344 | return u_len + 1; | |
345 | } | |
346 | ||
347 | int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen) | |
348 | { | |
349 | struct ustr filename, unifilename; | |
350 | int len; | |
351 | ||
352 | if (udf_build_ustr_exact(&unifilename, sname, flen)) | |
353 | { | |
354 | return 0; | |
355 | } | |
356 | ||
357 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) | |
358 | { | |
359 | if (!udf_CS0toUTF8(&filename, &unifilename) ) | |
360 | { | |
361 | udf_debug("Failed in udf_get_filename: sname = %s\n", sname); | |
362 | return 0; | |
363 | } | |
364 | } | |
365 | else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) | |
366 | { | |
367 | if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, &unifilename) ) | |
368 | { | |
369 | udf_debug("Failed in udf_get_filename: sname = %s\n", sname); | |
370 | return 0; | |
371 | } | |
372 | } | |
373 | else | |
374 | return 0; | |
375 | ||
376 | if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, | |
377 | unifilename.u_name, unifilename.u_len))) | |
378 | { | |
379 | return len; | |
380 | } | |
381 | return 0; | |
382 | } | |
383 | ||
384 | int udf_put_filename(struct super_block *sb, const uint8_t *sname, uint8_t *dname, int flen) | |
385 | { | |
386 | struct ustr unifilename; | |
387 | int namelen; | |
388 | ||
389 | if ( !(udf_char_to_ustr(&unifilename, sname, flen)) ) | |
390 | { | |
391 | return 0; | |
392 | } | |
393 | ||
394 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) | |
395 | { | |
396 | if ( !(namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN)) ) | |
397 | { | |
398 | return 0; | |
399 | } | |
400 | } | |
401 | else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) | |
402 | { | |
403 | if ( !(namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, &unifilename, UDF_NAME_LEN)) ) | |
404 | { | |
405 | return 0; | |
406 | } | |
407 | } | |
408 | else | |
409 | return 0; | |
410 | ||
411 | return namelen; | |
412 | } | |
413 | ||
#define ILLEGAL_CHAR_MARK	'_'	/* replaces each run of '/' or NUL */
#define EXT_MARK		'.'	/* separates base name and extension */
#define CRC_MARK		'#'	/* introduces the four hex CRC digits */
#define EXT_SIZE		5	/* longest extension that is preserved */

/* True for bytes that are replaced in the translated name. */
static inline int udf_is_illegal_char(uint8_t ch)
{
	return ch == '/' || ch == 0;
}

/*
 * udf_translate_to_linux
 *
 * Copy udfName (udfLen bytes) to newName, replacing every run of '/' or
 * NUL bytes with a single ILLEGAL_CHAR_MARK.  If anything was replaced,
 * the copy had to be cut, or the name is "." or "..", the result is
 * rewritten as <base>#<CRC>[.<ext>], where CRC is four upper-case hex
 * digits of udf_crc() over fidName/fidNameLen and ext is a trailing
 * extension of at most EXT_SIZE bytes.
 *
 * Returns the number of bytes stored in newName.
 */
static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int udfLen, uint8_t *fidName, int fidNameLen)
{
	static const uint8_t hex_digits[] = "0123456789ABCDEF";
	uint8_t ext[EXT_SIZE];
	int ext_len = 0;
	int out_len = 0, mangled = 0;
	int ext_src = 0, ext_out = 0, has_ext = 0;
	unsigned short crc;
	uint8_t ch;
	int pos, limit;

	if (udfName[0] == '.' &&
	    (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
		/* "." and ".." are copied as-is but always get a CRC suffix. */
		mangled = 1;
		out_len = udfLen;
		memcpy(newName, udfName, udfLen);
	} else {
		for (pos = 0; pos < udfLen; pos++) {
			ch = udfName[pos];
			if (udf_is_illegal_char(ch)) {
				mangled = 1;
				ch = ILLEGAL_CHAR_MARK;
				/* Swallow the rest of the illegal run. */
				while (pos + 1 < udfLen &&
				       udf_is_illegal_char(udfName[pos + 1]))
					pos++;
			}
			if (ch == EXT_MARK && (udfLen - pos - 1) <= EXT_SIZE) {
				/* A trailing '.' does not start an extension. */
				has_ext = (udfLen != pos + 1);
				if (has_ext) {
					ext_src = pos;
					ext_out = out_len;
				}
			}
			if (out_len < 256)
				newName[out_len++] = ch;
			else
				mangled = 1;
		}
	}

	if (!mangled)
		return out_len;

	if (has_ext) {
		/* Collect the extension, applying the same substitution. */
		for (pos = 0; pos < EXT_SIZE && ext_src + pos + 1 < udfLen; pos++) {
			ch = udfName[ext_src + pos + 1];
			if (udf_is_illegal_char(ch)) {
				ch = ILLEGAL_CHAR_MARK;
				while (ext_src + pos + 2 < udfLen &&
				       pos + 1 < EXT_SIZE &&
				       udf_is_illegal_char(udfName[ext_src + pos + 2]))
					pos++;
			}
			ext[ext_len++] = ch;
		}
		limit = 250 - ext_len;
		out_len = (out_len > limit) ? limit : ext_out;
	} else if (out_len > 250) {
		out_len = 250;
	}

	newName[out_len++] = CRC_MARK;
	crc = udf_crc(fidName, fidNameLen, 0);
	newName[out_len++] = hex_digits[(crc & 0xf000) >> 12];
	newName[out_len++] = hex_digits[(crc & 0x0f00) >> 8];
	newName[out_len++] = hex_digits[(crc & 0x00f0) >> 4];
	newName[out_len++] = hex_digits[(crc & 0x000f)];

	if (has_ext) {
		newName[out_len++] = EXT_MARK;
		for (pos = 0; pos < ext_len; pos++)
			newName[out_len++] = ext[pos];
	}

	return out_len;
}