blob: f38eea04d27bbe811f94f45bf0de88805c9c4dc9 [file] [log] [blame]
/*
  zip_utf-8.c -- UTF-8 support functions for libzip
  Copyright (C) 2011-2014 Dieter Baron and Thomas Klausner

  This file is part of libzip, a library to manipulate ZIP archives.
  The authors can be contacted at <libzip@nih.at>

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  1. Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in
     the documentation and/or other materials provided with the
     distribution.
  3. The names of the authors may not be used to endorse or promote
     products derived from this software without specific prior
     written permission.

  THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
33
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020034
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020035#include "zipint.h"
36
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020037#include <stdlib.h>
38
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020039
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020040static const zip_uint16_t _cp437_to_unicode[256] = {
41 /* 0x00 - 0x0F */
42 0x2007, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020043 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
44
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020045 /* 0x10 - 0x1F */
46 0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020047 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
48
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020049 /* 0x20 - 0x2F */
50 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020051 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
52
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020053 /* 0x30 - 0x3F */
54 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020055 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
56
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020057 /* 0x40 - 0x4F */
58 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020059 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
60
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020061 /* 0x50 - 0x5F */
62 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020063 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
64
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020065 /* 0x60 - 0x6F */
66 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020067 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
68
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020069 /* 0x70 - 0x7F */
70 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020071 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,
72
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020073 /* 0x80 - 0x8F */
74 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020075 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
76
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020077 /* 0x90 - 0x9F */
78 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020079 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
80
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020081 /* 0xA0 - 0xAF */
82 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020083 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
84
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020085 /* 0xB0 - 0xBF */
86 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020087 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
88
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020089 /* 0xC0 - 0xCF */
90 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020091 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
92
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020093 /* 0xD0 - 0xDF */
94 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020095 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
96
Thomas Klausner4e6e1da2011-04-09 12:52:07 +020097 /* 0xE0 - 0xEF */
98 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +020099 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
100
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200101 /* 0xF0 - 0xFF */
102 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200103 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0
104};
105
/*
 * Bit patterns for classifying UTF-8 bytes.  A byte b belongs to a class
 * when (b & <CLASS>_MASK) == <CLASS>_MATCH.
 */

/* Lead byte of a 2-byte sequence: 110xxxxx */
#define UTF_8_LEN_2_MASK     0xe0
#define UTF_8_LEN_2_MATCH    0xc0

/* Lead byte of a 3-byte sequence: 1110xxxx */
#define UTF_8_LEN_3_MASK     0xf0
#define UTF_8_LEN_3_MATCH    0xe0

/* Lead byte of a 4-byte sequence: 11110xxx */
#define UTF_8_LEN_4_MASK     0xf8
#define UTF_8_LEN_4_MATCH    0xf0

/* Continuation byte: 10xxxxxx */
#define UTF_8_CONTINUE_MASK  0xc0
#define UTF_8_CONTINUE_MATCH 0x80
114
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200115
Dieter Baron1d9dfeb2014-09-28 23:02:54 +0200116zip_encoding_type_t
117_zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding)
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200118{
Dieter Baron1d9dfeb2014-09-28 23:02:54 +0200119 zip_encoding_type_t enc;
Dieter Baron0e5eeab2012-04-24 18:47:12 +0200120 const zip_uint8_t *name;
Dieter Baron18d6b9e2012-07-15 15:58:57 +0200121 zip_uint32_t i, j, ulen;
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200122
Dieter Baron0e5eeab2012-04-24 18:47:12 +0200123 if (str == NULL)
124 return ZIP_ENCODING_ASCII;
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200125
Dieter Baron0e5eeab2012-04-24 18:47:12 +0200126 name = str->raw;
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200127
Dieter Baron0e5eeab2012-04-24 18:47:12 +0200128 if (str->encoding != ZIP_ENCODING_UNKNOWN)
129 enc = str->encoding;
130 else {
131 enc = ZIP_ENCODING_ASCII;
132 for (i=0; i<str->length; i++) {
133 if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
134 continue;
Dieter Baron5fc60142011-04-12 22:55:55 +0200135
Dieter Baron0e5eeab2012-04-24 18:47:12 +0200136 enc = ZIP_ENCODING_UTF8_GUESSED;
137 if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH)
138 ulen = 1;
139 else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
140 ulen = 2;
141 else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
142 ulen = 3;
143 else {
144 enc = ZIP_ENCODING_CP437;
145 break;
146 }
147
148 if (i + ulen >= str->length) {
149 enc = ZIP_ENCODING_CP437;
150 break;
151 }
152
153 for (j=1; j<=ulen; j++) {
154 if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
155 enc = ZIP_ENCODING_CP437;
156 goto done;
157 }
158 }
159 i += ulen;
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200160 }
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200161 }
162
Dieter Baron0e5eeab2012-04-24 18:47:12 +0200163done:
164 str->encoding = enc;
165
166 if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
167 if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
168 str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
169
170 if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
171 return ZIP_ENCODING_ERROR;
172 }
173
174 return enc;
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200175}
176
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200177
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200178static zip_uint32_t
179_zip_unicode_to_utf8_len(zip_uint32_t codepoint)
180{
181 if (codepoint < 0x0080)
182 return 1;
183 if (codepoint < 0x0800)
184 return 2;
Thomas Klausner708830d2011-04-13 13:42:10 +0200185 if (codepoint < 0x10000)
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200186 return 3;
187 return 4;
188}
189
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200190
191static zip_uint32_t
192_zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf)
193{
194 if (codepoint < 0x0080) {
195 buf[0] = codepoint & 0xff;
196 return 1;
197 }
198 if (codepoint < 0x0800) {
Thomas Klausner03ca1c12014-09-24 01:02:15 +0200199 buf[0] = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f));
200 buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200201 return 2;
202 }
Thomas Klausner708830d2011-04-13 13:42:10 +0200203 if (codepoint < 0x10000) {
Thomas Klausner03ca1c12014-09-24 01:02:15 +0200204 buf[0] = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f));
205 buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
206 buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200207 return 3;
208 }
Thomas Klausner03ca1c12014-09-24 01:02:15 +0200209 buf[0] = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07));
210 buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f));
211 buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
212 buf[3] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200213 return 4;
214}
215
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200216
Dieter Baron0e5eeab2012-04-24 18:47:12 +0200217zip_uint8_t *
218_zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf, zip_uint32_t len,
Dieter Baron1d9dfeb2014-09-28 23:02:54 +0200219 zip_uint32_t *utf8_lenp, zip_error_t *error)
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200220{
Dieter Baronc61eb212012-03-15 12:07:04 +0100221 zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200222 zip_uint8_t *utf8buf;
223 zip_uint32_t buflen, i, offset;
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200224
Dieter Baron888ebfa2012-02-27 10:06:47 +0100225 if (len == 0) {
226 if (utf8_lenp)
227 *utf8_lenp = 0;
Thomas Klausner8d339582012-01-05 15:45:07 +0100228 return NULL;
Dieter Baron888ebfa2012-02-27 10:06:47 +0100229 }
Thomas Klausner8d339582012-01-05 15:45:07 +0100230
Thomas Klausner13f7de12012-02-18 00:05:26 +0100231 buflen = 1;
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200232 for (i=0; i<len; i++)
233 buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200234
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200235 if ((utf8buf=(zip_uint8_t*)malloc(buflen)) == NULL) {
Thomas Klausnerea8ba492014-09-23 16:54:47 +0200236 zip_error_set(error, ZIP_ER_MEMORY, 0);
Thomas Klausner4e6e1da2011-04-09 12:52:07 +0200237 return NULL;
238 }
239
240 offset = 0;
241 for (i=0; i<len; i++)
242 offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]],
243 utf8buf+offset);
244
Thomas Klausner13f7de12012-02-18 00:05:26 +0100245 utf8buf[buflen-1] = 0;
Dieter Baron888ebfa2012-02-27 10:06:47 +0100246 if (utf8_lenp)
247 *utf8_lenp = buflen-1;
Dieter Baron0e5eeab2012-04-24 18:47:12 +0200248 return utf8buf;
Thomas Klausnera70ca5d2011-04-09 00:34:27 +0200249}