| Add code support for ICU. |
| |
| diff --git a/third_party/libxml/encoding.c b/third_party/libxml/encoding.c |
| index b86a547..0f41df9 100644 |
| --- a/third_party/libxml/encoding.c |
| +++ b/third_party/libxml/encoding.c |
| @@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL; |
| static int xmlCharEncodingAliasesNb = 0; |
| static int xmlCharEncodingAliasesMax = 0; |
| |
| -#ifdef LIBXML_ICONV_ENABLED |
| +#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED) |
| #if 0 |
| #define DEBUG_ENCODING /* Define this to get encoding traces */ |
| #endif |
| @@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val) |
| NULL, 0, val, NULL, NULL, 0, 0, msg, val); |
| } |
| |
| +#ifdef LIBXML_ICU_ENABLED |
| +/* Returns a new converter pair for encoding @name, or NULL on any failure. */ |
| +static uconv_t* |
| +openIcuConverter(const char* name, int toUnicode) |
| +{ |
| + UErrorCode rc = U_ZERO_ERROR; |
| + uconv_t *pair = (uconv_t *) xmlMalloc(sizeof(uconv_t)); |
| + if (pair == NULL) return NULL; |
| + |
| + /* Converter between the requested encoding and UTF-16. */ |
| + pair->uconv = ucnv_open(name, &rc); |
| + if (U_FAILURE(rc)) |
| + goto fail; |
| + |
| + /* @toUnicode selects which direction gets the STOP error callback. */ |
| + rc = U_ZERO_ERROR; |
| + if (toUnicode) { |
| + ucnv_setToUCallBack(pair->uconv, UCNV_TO_U_CALLBACK_STOP, |
| + NULL, NULL, NULL, &rc); |
| + } else { |
| + ucnv_setFromUCallBack(pair->uconv, UCNV_FROM_U_CALLBACK_STOP, |
| + NULL, NULL, NULL, &rc); |
| + } |
| + if (U_FAILURE(rc)) |
| + goto fail; |
| + |
| + /* Companion UTF-8 converter; its success completes the pair. */ |
| + rc = U_ZERO_ERROR; |
| + pair->utf8 = ucnv_open("UTF-8", &rc); |
| + if (!U_FAILURE(rc)) |
| + return pair; |
| +fail: |
| + if (pair->uconv != NULL) |
| + ucnv_close(pair->uconv); |
| + xmlFree(pair); |
| + return NULL; |
| +} |
| + |
| +/* Closes both ICU converters held by @conv and frees it; NULL is ignored. */ |
| +static void |
| +closeIcuConverter(uconv_t *conv) |
| +{ |
| + if (conv == NULL) return; |
| + ucnv_close(conv->uconv); |
| + ucnv_close(conv->utf8); |
| + xmlFree(conv); |
| +} |
| +#endif /* LIBXML_ICU_ENABLED */ |
| + |
| /************************************************************************ |
| * * |
| * Conversions To/From UTF8 encoding * |
| @@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name, |
| #ifdef LIBXML_ICONV_ENABLED |
| handler->iconv_in = NULL; |
| handler->iconv_out = NULL; |
| -#endif /* LIBXML_ICONV_ENABLED */ |
| +#endif |
| +#ifdef LIBXML_ICU_ENABLED |
| + handler->uconv_in = NULL; |
| + handler->uconv_out = NULL; |
| +#endif |
| |
| /* |
| * registers and returns the handler. |
| @@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) { |
| xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL); |
| xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL); |
| #endif /* LIBXML_OUTPUT_ENABLED */ |
| -#ifndef LIBXML_ICONV_ENABLED |
| +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) |
| #ifdef LIBXML_ISO8859X_ENABLED |
| xmlRegisterCharEncodingHandlersISO8859x (); |
| #endif |
| @@ -1578,6 +1630,10 @@ xmlFindCharEncodingHandler(const char *name) { |
| xmlCharEncodingHandlerPtr enc; |
| iconv_t icv_in, icv_out; |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + xmlCharEncodingHandlerPtr enc; |
| + uconv_t *ucv_in, *ucv_out; |
| +#endif /* LIBXML_ICU_ENABLED */ |
| char upper[100]; |
| int i; |
| |
| @@ -1647,6 +1703,35 @@ xmlFindCharEncodingHandler(const char *name) { |
| "iconv : problems with filters for '%s'\n", name); |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + /* check whether icu can handle this */ |
| + ucv_in = openIcuConverter(name, 1); |
| + ucv_out = openIcuConverter(name, 0); |
| + if (ucv_in != NULL && ucv_out != NULL) { |
| + enc = (xmlCharEncodingHandlerPtr) |
| + xmlMalloc(sizeof(xmlCharEncodingHandler)); |
| + if (enc == NULL) { |
| + closeIcuConverter(ucv_in); |
| + closeIcuConverter(ucv_out); |
| + return(NULL); |
| + } |
| + enc->name = xmlMemStrdup(name); |
| + enc->input = NULL; |
| + enc->output = NULL; |
| + enc->uconv_in = ucv_in; |
| + enc->uconv_out = ucv_out; |
| +#ifdef DEBUG_ENCODING |
| + xmlGenericError(xmlGenericErrorContext, |
| + "Found ICU converter handler for encoding %s\n", name); |
| +#endif |
| + return enc; |
| + } else if (ucv_in != NULL || ucv_out != NULL) { |
| + closeIcuConverter(ucv_in); |
| + closeIcuConverter(ucv_out); |
| + xmlEncodingErr(XML_ERR_INTERNAL_ERROR, |
| + "ICU converter : problems with filters for '%s'\n", name); |
| + } |
| +#endif /* LIBXML_ICU_ENABLED */ |
| |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| @@ -1737,6 +1822,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, |
| |
| /************************************************************************ |
| * * |
| + * ICU based generic conversion functions * |
| + * * |
| + ************************************************************************/ |
| + |
| +#ifdef LIBXML_ICU_ENABLED |
| +/** |
| + * xmlUconvWrapper: |
| + * @cd: ICU converter pair (the encoding's converter plus a UTF-8 one) |
| + * @toUnicode: non-zero for encoding => UTF-8, 0 for UTF-8 => encoding |
| + * @out: a pointer to an array of bytes to store the result |
| + * @outlen: on entry the capacity of @out, on return the bytes written |
| + * @in: a pointer to the array of bytes to convert |
| + * @inlen: on entry the length of @in, on return the bytes consumed |
| + * |
| + * Transcodes @in into @out with a single ucnv_convertEx() call. |
| + * |
| + * Returns 0 on success, or |
| + * -1 if an argument is NULL (a non-NULL @outlen is then zeroed) or on U_BUFFER_OVERFLOW_ERROR, or |
| + * -2 on U_INVALID_CHAR_FOUND or U_ILLEGAL_CHAR_FOUND, or |
| + * -3 on any other ICU failure (e.g. U_TRUNCATED_CHAR_FOUND). |
| + */ |
| +static int |
| +xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, |
| + const unsigned char *in, int *inlen) { |
| + UConverter *target, *source; |
| + const char *rd = (const char *) in; |
| + char *wr = (char *) out; |
| + UErrorCode status = U_ZERO_ERROR; |
| + |
| + if ((in == NULL) || (inlen == NULL) || (out == NULL) || (outlen == NULL)) { |
| + if (outlen != NULL) *outlen = 0; |
| + return(-1); |
| + } |
| + |
| + /* |
| + * TODO(jungshik) |
| + * 1. is ucnv_convert(To|From)Algorithmic better? |
| + * 2. had we better use an explicit pivot buffer? |
| + * 3. error returned comes from 'fromUnicode' only even |
| + * when toUnicode is true ! |
| + */ |
| + if (toUnicode) { |
| + /* encoding => UTF-16 => UTF-8 */ |
| + target = cd->utf8; |
| + source = cd->uconv; |
| + } else { |
| + /* UTF-8 => UTF-16 => encoding */ |
| + target = cd->uconv; |
| + source = cd->utf8; |
| + } |
| + ucnv_convertEx(target, source, &wr, wr + *outlen, &rd, rd + *inlen, |
| + NULL, NULL, NULL, NULL, 0, TRUE, &status); |
| + |
| + /* Report progress to the caller whatever the outcome. */ |
| + *inlen = rd - (const char *) in; |
| + *outlen = wr - (char *) out; |
| + if (U_SUCCESS(status)) |
| + return 0; |
| + switch (status) { |
| + case U_BUFFER_OVERFLOW_ERROR: return -1; |
| + case U_INVALID_CHAR_FOUND: |
| + case U_ILLEGAL_CHAR_FOUND: return -2; |
| + default: return -3; |
| + } |
| +} |
| +#endif /* LIBXML_ICU_ENABLED */ |
| + |
| +/************************************************************************ |
| + * * |
| * The real API used by libxml for on-the-fly conversion * |
| * * |
| ************************************************************************/ |
| @@ -1810,6 +1964,16 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, |
| if (ret == -1) ret = -3; |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + else if (handler->uconv_in != NULL) { |
| + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], |
| + &written, in->content, &toconv); |
| + xmlBufferShrink(in, toconv); |
| + out->use += written; |
| + out->content[out->use] = 0; |
| + if (ret == -1) ret = -3; |
| + } |
| +#endif /* LIBXML_ICU_ENABLED */ |
| #ifdef DEBUG_ENCODING |
| switch (ret) { |
| case 0: |
| @@ -1915,6 +2079,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, |
| ret = -3; |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + else if (handler->uconv_in != NULL) { |
| + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], |
| + &written, in->content, &toconv); |
| + xmlBufferShrink(in, toconv); |
| + out->use += written; |
| + out->content[out->use] = 0; |
| + if (ret == -1) |
| + ret = -3; |
| + } |
| +#endif /* LIBXML_ICU_ENABLED */ |
| switch (ret) { |
| case 0: |
| #ifdef DEBUG_ENCODING |
| @@ -2015,6 +2190,15 @@ retry: |
| out->content[out->use] = 0; |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + else if (handler->uconv_out != NULL) { |
| + ret = xmlUconvWrapper(handler->uconv_out, 0, |
| + &out->content[out->use], |
| + &written, NULL, &toconv); |
| + out->use += written; |
| + out->content[out->use] = 0; |
| + } |
| +#endif /* LIBXML_ICU_ENABLED */ |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "initialized encoder\n"); |
| @@ -2061,6 +2245,26 @@ retry: |
| } |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + else if (handler->uconv_out != NULL) { |
| + ret = xmlUconvWrapper(handler->uconv_out, 0, |
| + &out->content[out->use], |
| + &written, in->content, &toconv); |
| + xmlBufferShrink(in, toconv); |
| + out->use += written; |
| + writtentot += written; |
| + out->content[out->use] = 0; |
| + if (ret == -1) { |
| + if (written > 0) { |
| + /* |
| + * Can be a limitation of ICU |
| + */ |
| + goto retry; |
| + } |
| + ret = -3; |
| + } |
| + } |
| +#endif /* LIBXML_ICU_ENABLED */ |
| else { |
| xmlEncodingErr(XML_I18N_NO_OUTPUT, |
| "xmlCharEncOutFunc: no output function !\n", NULL); |
| @@ -2173,6 +2377,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { |
| xmlFree(handler); |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) { |
| + if (handler->name != NULL) |
| + xmlFree(handler->name); |
| + handler->name = NULL; |
| + if (handler->uconv_out != NULL) { |
| + closeIcuConverter(handler->uconv_out); |
| + handler->uconv_out = NULL; |
| + } |
| + if (handler->uconv_in != NULL) { |
| + closeIcuConverter(handler->uconv_in); |
| + handler->uconv_in = NULL; |
| + } |
| + xmlFree(handler); |
| + } |
| +#endif |
| #ifdef DEBUG_ENCODING |
| if (ret) |
| xmlGenericError(xmlGenericErrorContext, |
| @@ -2248,6 +2468,24 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { |
| cur += toconv; |
| } while (ret == -2); |
| #endif |
| +#ifdef LIBXML_ICU_ENABLED |
| + } else if (handler->uconv_out != NULL) { |
| + /* Re-encode the pending input via ICU; 'unused' sums the bytes produced. */ |
| + do { |
| + toconv = in->end - cur; |
| + written = 32000; |
| + ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0], |
| + &written, cur, &toconv); |
| + if (ret < 0) { |
| + if (written > 0) |
| + ret = -2; /* partial output: run another pass */ |
| + else |
| + return(-1); |
| + } |
| + unused += written; |
| + cur += toconv; |
| + } while (ret == -2); |
| +#endif /* LIBXML_ICU_ENABLED */ |
| } else { |
| /* could not find a converter */ |
| return(-1); |
| @@ -2259,8 +2497,8 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { |
| } |
| return(in->consumed + (in->cur - in->base)); |
| } |
| |
| -#ifndef LIBXML_ICONV_ENABLED |
| +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) |
| #ifdef LIBXML_ISO8859X_ENABLED |
| |
| /** |
| diff --git a/third_party/libxml/include/libxml/encoding.h b/third_party/libxml/include/libxml/encoding.h |
| index c74b25f..b5f8b48 100644 |
| --- a/third_party/libxml/include/libxml/encoding.h |
| +++ b/third_party/libxml/include/libxml/encoding.h |
| @@ -26,6 +26,24 @@ |
| |
| #ifdef LIBXML_ICONV_ENABLED |
| #include <iconv.h> |
| +#else |
| +#ifdef LIBXML_ICU_ENABLED |
| +#include <unicode/ucnv.h> |
| +#if 0 |
| +/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h> |
| + * to prevent unwanted ICU symbols being exposed to users of libxml2. |
| + * One particular case is Qt4 conflicting on UChar32. |
| + */ |
| +#include <stdint.h> |
| +struct UConverter; |
| +typedef struct UConverter UConverter; |
| +#ifdef _MSC_VER |
| +typedef wchar_t UChar; |
| +#else |
| +typedef uint16_t UChar; |
| +#endif |
| +#endif |
| +#endif |
| #endif |
| #ifdef __cplusplus |
| extern "C" { |
| @@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, |
| * Block defining the handlers for non UTF-8 encodings. |
| * If iconv is supported, there are two extra fields. |
| */ |
| +#ifdef LIBXML_ICU_ENABLED |
| +struct _uconv_t { /* converter pair used to transcode through UTF-16 */ |
| + UConverter *uconv; /* for conversion between an encoding and UTF-16 */ |
| + UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ |
| +}; |
| +typedef struct _uconv_t uconv_t; /* handle type stored in xmlCharEncodingHandler */ |
| +#endif |
| |
| typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; |
| typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; |
| @@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler { |
| iconv_t iconv_in; |
| iconv_t iconv_out; |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + uconv_t *uconv_in; |
| + uconv_t *uconv_out; |
| +#endif /* LIBXML_ICU_ENABLED */ |
| }; |
| |
| #ifdef __cplusplus |
| diff --git a/third_party/libxml/include/libxml/parser.h b/third_party/libxml/include/libxml/parser.h |
| index dd79c42..3580b63 100644 |
| --- a/third_party/libxml/include/libxml/parser.h |
| +++ b/third_party/libxml/include/libxml/parser.h |
| @@ -1222,6 +1222,7 @@ typedef enum { |
| XML_WITH_DEBUG_MEM = 29, |
| XML_WITH_DEBUG_RUN = 30, |
| XML_WITH_ZLIB = 31, |
| + XML_WITH_ICU = 32, |
| XML_WITH_NONE = 99999 /* just to be sure of allocation size */ |
| } xmlFeature; |
| |
| diff --git a/third_party/libxml/include/libxml/xmlversion.h.in b/third_party/libxml/include/libxml/xmlversion.h.in |
| index 4739f3a..de310ab 100644 |
| --- a/third_party/libxml/include/libxml/xmlversion.h.in |
| +++ b/third_party/libxml/include/libxml/xmlversion.h.in |
| @@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); |
| #endif |
| |
| /** |
| + * LIBXML_ICU_ENABLED: |
| + * |
| + * Whether ICU support is available |
| + */ |
| +#if @WITH_ICU@ |
| +#define LIBXML_ICU_ENABLED |
| +#endif |
| + |
| +/** |
| * LIBXML_ISO8859X_ENABLED: |
| * |
| * Whether ISO-8859-* support is made available in case iconv is not |
| diff --git a/third_party/libxml/parser.c b/third_party/libxml/parser.c |
| index 85e7599..3ba2a06 100644 |
| --- a/third_party/libxml/parser.c |
| +++ b/third_party/libxml/parser.c |
| @@ -954,6 +954,12 @@ xmlHasFeature(xmlFeature feature) |
| #else |
| return(0); |
| #endif |
| + case XML_WITH_ICU: |
| +#ifdef LIBXML_ICU_ENABLED |
| + return(1); |
| +#else |
| + return(0); |
| +#endif |
| default: |
| break; |
| } |