Improve internal data structures. Add a new, read-only extra field API. Add a paranoid mode to zipcmp that compares more metadata. Keep metadata up to date for changed entries and avoid parallel arrays. XXX: The extra field API functions that modify fields are still stubs. The old API has been removed.
diff --git a/lib/zip_utf-8.c b/lib/zip_utf-8.c index d184b17..4b8e4b1 100644 --- a/lib/zip_utf-8.c +++ b/lib/zip_utf-8.c
@@ -118,39 +118,65 @@ enum zip_encoding_type -_zip_guess_encoding(const char * const _name, zip_uint32_t len) +_zip_guess_encoding(struct zip_string *str, enum zip_encoding_type expected_encoding) { - zip_uint8_t *name = (zip_uint8_t *)_name; + enum zip_encoding_type enc; + const zip_uint8_t *name; zip_uint32_t i; - int ret; int j, ulen; - ret = ZIP_ENCODING_ASCII; - for (i=0; i<len; i++) { - if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t') - continue; + if (str == NULL) + return ZIP_ENCODING_ASCII; - ret = ZIP_ENCODING_UTF8; - if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH) - ulen = 1; - else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH) - ulen = 2; - else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH) - ulen = 3; - else - return ZIP_ENCODING_CP437; + name = str->raw; - if (i + ulen >= len) - return ZIP_ENCODING_CP437; + if (str->encoding != ZIP_ENCODING_UNKNOWN) + enc = str->encoding; + else { + enc = ZIP_ENCODING_ASCII; + for (i=0; i<str->length; i++) { + if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t') + continue; - for (j=1; j<=ulen; j++) { - if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) - return ZIP_ENCODING_CP437; + enc = ZIP_ENCODING_UTF8_GUESSED; + if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH) + ulen = 1; + else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH) + ulen = 2; + else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH) + ulen = 3; + else { + enc = ZIP_ENCODING_CP437; + break; + } + + if (i + ulen >= str->length) { + enc = ZIP_ENCODING_CP437; + break; + } + + for (j=1; j<=ulen; j++) { + if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) { + enc = ZIP_ENCODING_CP437; + goto done; + } + } + i += ulen; } - i += ulen; } - return ret; +done: + str->encoding = enc; + + if (expected_encoding != ZIP_ENCODING_UNKNOWN) { + if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == 
ZIP_ENCODING_UTF8_GUESSED) + str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN; + + if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII) + return ZIP_ENCODING_ERROR; + } + + return enc; } @@ -196,8 +222,8 @@ -char * -_zip_cp437_to_utf8(const char * const _cp437buf, zip_uint32_t len, +zip_uint8_t * +_zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf, zip_uint32_t len, zip_uint32_t *utf8_lenp, struct zip_error *error) { zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf; @@ -227,5 +253,5 @@ utf8buf[buflen-1] = 0; if (utf8_lenp) *utf8_lenp = buflen-1; - return (char *)utf8buf; + return utf8buf; }