VeraCrypt
aboutsummaryrefslogtreecommitdiff
path: root/src/Common/libzip/zip_utf-8.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/Common/libzip/zip_utf-8.c')
-rw-r--r--src/Common/libzip/zip_utf-8.c146
1 files changed, 102 insertions, 44 deletions
diff --git a/src/Common/libzip/zip_utf-8.c b/src/Common/libzip/zip_utf-8.c
index 678912f6..7d671f60 100644
--- a/src/Common/libzip/zip_utf-8.c
+++ b/src/Common/libzip/zip_utf-8.c
@@ -1,7 +1,7 @@
/*
zip_utf-8.c -- UTF-8 support functions for libzip
- Copyright (C) 2011-2021 Dieter Baron and Thomas Klausner
+ Copyright (C) 2011-2024 Dieter Baron and Thomas Klausner
This file is part of libzip, a library to manipulate ZIP archives.
The authors can be contacted at <info@libzip.org>
@@ -100,72 +100,126 @@ zip_encoding_type_t
_zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding) {
zip_encoding_type_t enc;
const zip_uint8_t *name;
zip_uint32_t i, j, ulen;
+ bool can_be_ascii = true;
+ bool can_be_utf8 = true;
+ bool has_control_characters = false;
- if (str == NULL)
+ if (str == NULL) {
return ZIP_ENCODING_ASCII;
+ }
name = str->raw;
- if (str->encoding != ZIP_ENCODING_UNKNOWN)
- enc = str->encoding;
- else {
- enc = ZIP_ENCODING_ASCII;
- for (i = 0; i < str->length; i++) {
- if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
- continue;
-
- enc = ZIP_ENCODING_UTF8_GUESSED;
- if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH)
- ulen = 1;
- else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
- ulen = 2;
- else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
- ulen = 3;
- else {
- enc = ZIP_ENCODING_CP437;
- break;
- }
+ if (str->encoding != ZIP_ENCODING_UNKNOWN) {
+ return str->encoding;
+ }
- if (i + ulen >= str->length) {
- enc = ZIP_ENCODING_CP437;
- break;
+ for (i = 0; i < str->length; i++) {
+ if (name[i] < 128) {
+ if (name[i] < 32 && name[i] != '\r' && name[i] != '\n' && name[i] != '\t') {
+ has_control_characters = true;
}
+ continue;
+ }
+
+ can_be_ascii = false;
+ if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH) {
+ ulen = 1;
+ }
+ else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH) {
+ ulen = 2;
+ }
+ else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH) {
+ ulen = 3;
+ }
+ else {
+ can_be_utf8 = false;
+ break;
+ }
- for (j = 1; j <= ulen; j++) {
- if ((name[i + j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
- enc = ZIP_ENCODING_CP437;
- goto done;
- }
+ if (i + ulen >= str->length) {
+ can_be_utf8 = false;
+ break;
+ }
+
+ for (j = 1; j <= ulen; j++) {
+ if ((name[i + j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
+ can_be_utf8 = false;
+ goto done;
}
- i += ulen;
}
+ i += ulen;
}
-done:
- str->encoding = enc;
+ done:
+ enc = ZIP_ENCODING_CP437;
+
+ switch (expected_encoding) {
+ case ZIP_ENCODING_UTF8_KNOWN:
+ case ZIP_ENCODING_UTF8_GUESSED:
+ if (can_be_utf8) {
+ enc = ZIP_ENCODING_UTF8_KNOWN;
+ }
+ else {
+ enc = ZIP_ENCODING_ERROR;
+ }
+ break;
- if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
- if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
- str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
+ case ZIP_ENCODING_ASCII:
+ if (can_be_ascii && !has_control_characters) {
+ enc = ZIP_ENCODING_ASCII;
+ }
+ else {
+ enc = ZIP_ENCODING_ERROR;
+ }
+ break;
- if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
- return ZIP_ENCODING_ERROR;
+ case ZIP_ENCODING_CP437:
+ enc = ZIP_ENCODING_CP437;
+ break;
+
+ case ZIP_ENCODING_UNKNOWN:
+ if (can_be_ascii && !has_control_characters) {
+ /* only bytes from 0x20-0x7F */
+ enc = ZIP_ENCODING_ASCII;
+ }
+ else if (can_be_ascii && has_control_characters) {
+ /* only bytes from 0x00-0x7F */
+ enc = ZIP_ENCODING_CP437;
+ }
+ else if (can_be_utf8) {
+ /* contains bytes from 0x80-0xFF and is valid UTF-8 */
+ enc = ZIP_ENCODING_UTF8_GUESSED;
+ }
+ else {
+ /* fallback */
+ enc = ZIP_ENCODING_CP437;
+ }
+ break;
+ case ZIP_ENCODING_ERROR:
+ /* invalid, shouldn't happen */
+ enc = ZIP_ENCODING_ERROR;
+ break;
}
+ str->encoding = enc;
return enc;
}
static zip_uint32_t
_zip_unicode_to_utf8_len(zip_uint32_t codepoint) {
- if (codepoint < 0x0080)
+ if (codepoint < 0x0080) {
return 1;
- if (codepoint < 0x0800)
+ }
+ if (codepoint < 0x0800) {
return 2;
- if (codepoint < 0x10000)
+ }
+ if (codepoint < 0x10000) {
return 3;
+ }
return 4;
}
@@ -200,27 +254,31 @@ _zip_cp437_to_utf8(const zip_uint8_t *const _cp437buf, zip_uint32_t len, zip_uin
zip_uint8_t *utf8buf;
zip_uint32_t buflen, i, offset;
if (len == 0) {
- if (utf8_lenp)
+ if (utf8_lenp) {
*utf8_lenp = 0;
+ }
return NULL;
}
buflen = 1;
- for (i = 0; i < len; i++)
+ for (i = 0; i < len; i++) {
buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
+ }
if ((utf8buf = (zip_uint8_t *)malloc(buflen)) == NULL) {
zip_error_set(error, ZIP_ER_MEMORY, 0);
return NULL;
}
offset = 0;
- for (i = 0; i < len; i++)
+ for (i = 0; i < len; i++) {
offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]], utf8buf + offset);
+ }
utf8buf[buflen - 1] = 0;
- if (utf8_lenp)
+ if (utf8_lenp) {
*utf8_lenp = buflen - 1;
+ }
return utf8buf;
}