blob: 4503f922fcb32ede3fab33aa7d508253e87cdb28 [file] [log] [blame]
Add code support for ICU.
diff --git a/third_party/libxml/encoding.c b/third_party/libxml/encoding.c
index b86a547..0f41df9 100644
--- a/third_party/libxml/encoding.c
+++ b/third_party/libxml/encoding.c
@@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;
static int xmlCharEncodingAliasesMax = 0;
-#ifdef LIBXML_ICONV_ENABLED
+#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
#if 0
#define DEBUG_ENCODING /* Define this to get encoding traces */
#endif
@@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
NULL, 0, val, NULL, NULL, 0, 0, msg, val);
}
+#ifdef LIBXML_ICU_ENABLED
+/* Opens an ICU converter for NAME plus a UTF-8 one; NULL on any failure. */
+static uconv_t*
+openIcuConverter(const char* name, int toUnicode)
+{
+  UErrorCode rc = U_ZERO_ERROR;
+  uconv_t *pair = (uconv_t *) xmlMalloc(sizeof(uconv_t));
+  if (pair == NULL)
+    return NULL;
+
+  pair->uconv = ucnv_open(name, &rc);
+  if (U_FAILURE(rc))
+    goto fail;
+
+  /* Install the STOP callback for the direction selected by toUnicode. */
+  rc = U_ZERO_ERROR;
+  if (toUnicode)
+    ucnv_setToUCallBack(pair->uconv, UCNV_TO_U_CALLBACK_STOP,
+                        NULL, NULL, NULL, &rc);
+  else
+    ucnv_setFromUCallBack(pair->uconv, UCNV_FROM_U_CALLBACK_STOP,
+                          NULL, NULL, NULL, &rc);
+  if (U_FAILURE(rc))
+    goto fail;
+  /* The second converter handles the UTF-8 side of each conversion. */
+  rc = U_ZERO_ERROR;
+  pair->utf8 = ucnv_open("UTF-8", &rc);
+  if (!U_FAILURE(rc))
+    return pair;
+
+fail:
+  if (pair->uconv != NULL)
+    ucnv_close(pair->uconv);
+  xmlFree(pair);
+  return NULL;
+}
+
+static void
+closeIcuConverter(uconv_t *conv)
+{
+  /* Closes both ICU converters, then frees the pair; NULL is ignored. */
+  if (conv == NULL) return;
+  ucnv_close(conv->uconv);
+  ucnv_close(conv->utf8);
+  xmlFree(conv);
+}
+#endif /* LIBXML_ICU_ENABLED */
+
/************************************************************************
* *
* Conversions To/From UTF8 encoding *
@@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name,
#ifdef LIBXML_ICONV_ENABLED
handler->iconv_in = NULL;
handler->iconv_out = NULL;
-#endif /* LIBXML_ICONV_ENABLED */
+#endif
+#ifdef LIBXML_ICU_ENABLED
+ handler->uconv_in = NULL;
+ handler->uconv_out = NULL;
+#endif
/*
* registers and returns the handler.
@@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) {
xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
#endif /* LIBXML_OUTPUT_ENABLED */
-#ifndef LIBXML_ICONV_ENABLED
+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
#ifdef LIBXML_ISO8859X_ENABLED
xmlRegisterCharEncodingHandlersISO8859x ();
#endif
@@ -1578,6 +1630,10 @@ xmlFindCharEncodingHandler(const char *name) {
xmlCharEncodingHandlerPtr enc;
iconv_t icv_in, icv_out;
#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+ xmlCharEncodingHandlerPtr enc;
+ uconv_t *ucv_in, *ucv_out;
+#endif /* LIBXML_ICU_ENABLED */
char upper[100];
int i;
@@ -1647,6 +1703,35 @@ xmlFindCharEncodingHandler(const char *name) {
"iconv : problems with filters for '%s'\n", name);
}
#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+ /* check whether icu can handle this */
+ ucv_in = openIcuConverter(name, 1);
+ ucv_out = openIcuConverter(name, 0);
+ if (ucv_in != NULL && ucv_out != NULL) {
+ enc = (xmlCharEncodingHandlerPtr)
+ xmlMalloc(sizeof(xmlCharEncodingHandler));
+ if (enc == NULL) {
+ closeIcuConverter(ucv_in);
+ closeIcuConverter(ucv_out);
+ return(NULL);
+ }
+ enc->name = xmlMemStrdup(name);
+ enc->input = NULL;
+ enc->output = NULL;
+ enc->uconv_in = ucv_in;
+ enc->uconv_out = ucv_out;
+#ifdef DEBUG_ENCODING
+ xmlGenericError(xmlGenericErrorContext,
+ "Found ICU converter handler for encoding %s\n", name);
+#endif
+ return enc;
+ } else if (ucv_in != NULL || ucv_out != NULL) {
+ closeIcuConverter(ucv_in);
+ closeIcuConverter(ucv_out);
+ xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
+ "ICU converter : problems with filters for '%s'\n", name);
+ }
+#endif /* LIBXML_ICU_ENABLED */
#ifdef DEBUG_ENCODING
xmlGenericError(xmlGenericErrorContext,
@@ -1737,6 +1822,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
/************************************************************************
* *
+ * ICU based generic conversion functions *
+ * *
+ ************************************************************************/
+
+#ifdef LIBXML_ICU_ENABLED
+/**
+ * xmlUconvWrapper:
+ * @cd: pair of ICU converters (the encoding's converter and a UTF-8 one)
+ * @toUnicode: non-zero to convert from the encoding to UTF-8,
+ *             zero to convert from UTF-8 to the encoding
+ * @out: buffer receiving the converted bytes
+ * @outlen: in: capacity of @out; out: bytes written (0 if an argument is NULL)
+ * @in: buffer holding the bytes to convert
+ * @inlen: in: length of @in; out: number of bytes consumed
+ *
+ * Returns 0 on success, or
+ *     -1 if @out is too small or a required pointer is NULL, or
+ *     -2 if ICU reports an invalid or illegal character, or
+ *     -3 for any other ICU error (e.g. a truncated trailing character).
+ */
+static int
+xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
+                const unsigned char *in, int *inlen) {
+    const char *src = (const char *) in;
+    char *dst = (char *) out;
+    UConverter *from, *to;
+    UErrorCode err = U_ZERO_ERROR;
+
+    if ((in == NULL) || (inlen == NULL) || (outlen == NULL) || (out == NULL)) {
+        if (outlen != NULL)
+            *outlen = 0;
+        return(-1);
+    }
+
+    /*
+     * TODO(jungshik)
+     * 1. is ucnv_convert(To|From)Algorithmic better?
+     * 2. had we better use an explicit pivot buffer?
+     * 3. error returned comes from 'fromUnicode' only even
+     *    when toUnicode is true !
+     */
+    if (toUnicode) {
+        from = cd->uconv;    /* encoding => UTF-16 => UTF-8 */
+        to = cd->utf8;
+    } else {
+        from = cd->utf8;     /* UTF-8 => UTF-16 => encoding */
+        to = cd->uconv;
+    }
+    ucnv_convertEx(to, from, &dst, dst + *outlen, &src, src + *inlen,
+                   NULL, NULL, NULL, NULL, 0, TRUE, &err);
+    /* Report progress as byte counts, whatever the outcome. */
+    *inlen = src - (const char *) in;
+    *outlen = dst - (char *) out;
+    if (U_SUCCESS(err))
+        return 0;
+    switch (err) {
+        case U_BUFFER_OVERFLOW_ERROR:
+            return -1;
+        case U_INVALID_CHAR_FOUND:
+        case U_ILLEGAL_CHAR_FOUND:
+            return -2;
+        default:    /* e.g. U_TRUNCATED_CHAR_FOUND */
+            return -3;
+    }
+}
+#endif /* LIBXML_ICU_ENABLED */
+
+/************************************************************************
+ * *
* The real API used by libxml for on-the-fly conversion *
* *
************************************************************************/
@@ -1810,6 +1964,16 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
if (ret == -1) ret = -3;
}
#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+ else if (handler->uconv_in != NULL) {
+ ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
+ &written, in->content, &toconv);
+ xmlBufferShrink(in, toconv);
+ out->use += written;
+ out->content[out->use] = 0;
+ if (ret == -1) ret = -3;
+ }
+#endif /* LIBXML_ICU_ENABLED */
#ifdef DEBUG_ENCODING
switch (ret) {
case 0:
@@ -1915,6 +2079,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
ret = -3;
}
#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+ else if (handler->uconv_in != NULL) {
+ ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
+ &written, in->content, &toconv);
+ xmlBufferShrink(in, toconv);
+ out->use += written;
+ out->content[out->use] = 0;
+ if (ret == -1)
+ ret = -3;
+ }
+#endif /* LIBXML_ICU_ENABLED */
switch (ret) {
case 0:
#ifdef DEBUG_ENCODING
@@ -2015,6 +2190,15 @@ retry:
out->content[out->use] = 0;
}
#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+ else if (handler->uconv_out != NULL) {
+ ret = xmlUconvWrapper(handler->uconv_out, 0,
+ &out->content[out->use],
+ &written, NULL, &toconv);
+ out->use += written;
+ out->content[out->use] = 0;
+ }
+#endif /* LIBXML_ICU_ENABLED */
#ifdef DEBUG_ENCODING
xmlGenericError(xmlGenericErrorContext,
"initialized encoder\n");
@@ -2061,6 +2245,26 @@ retry:
}
}
#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+ else if (handler->uconv_out != NULL) {
+ ret = xmlUconvWrapper(handler->uconv_out, 0,
+ &out->content[out->use],
+ &written, in->content, &toconv);
+ xmlBufferShrink(in, toconv);
+ out->use += written;
+ writtentot += written;
+ out->content[out->use] = 0;
+ if (ret == -1) {
+ if (written > 0) {
+ /*
+ * ICU wrote some output before running out of room
+ */
+ goto retry;
+ }
+ ret = -3;
+ }
+ }
+#endif /* LIBXML_ICU_ENABLED */
else {
xmlEncodingErr(XML_I18N_NO_OUTPUT,
"xmlCharEncOutFunc: no output function !\n", NULL);
@@ -2173,6 +2377,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
xmlFree(handler);
}
#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+ if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) {
+ if (handler->name != NULL)
+ xmlFree(handler->name);
+ handler->name = NULL;
+ if (handler->uconv_out != NULL) {
+ closeIcuConverter(handler->uconv_out);
+ handler->uconv_out = NULL;
+ }
+ if (handler->uconv_in != NULL) {
+ closeIcuConverter(handler->uconv_in);
+ handler->uconv_in = NULL;
+ }
+ xmlFree(handler);
+ }
+#endif
#ifdef DEBUG_ENCODING
if (ret)
xmlGenericError(xmlGenericErrorContext,
@@ -2248,6 +2468,23 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
cur += toconv;
} while (ret == -2);
#endif
+#ifdef LIBXML_ICU_ENABLED
+ } else if (handler->uconv_out != NULL) {
+ do {
+ toconv = in->end - cur;
+ written = 32000;
+ ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0],
+ &written, cur, &toconv);
+ if (ret < 0) {
+ if (written > 0)
+ ret = -2;
+ else
+ return(-1);
+ }
+ unused += written;
+ cur += toconv;
+ } while (ret == -2);
+#endif /* LIBXML_ICU_ENABLED: must close here so the fallback else and the end of the function survive without ICU */
} else {
/* could not find a converter */
return(-1);
@@ -2259,8 +2496,8 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
}
return(in->consumed + (in->cur - in->base));
}
-#ifndef LIBXML_ICONV_ENABLED
+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
#ifdef LIBXML_ISO8859X_ENABLED
/**
diff --git a/third_party/libxml/include/libxml/encoding.h b/third_party/libxml/include/libxml/encoding.h
index c74b25f..b5f8b48 100644
--- a/third_party/libxml/include/libxml/encoding.h
+++ b/third_party/libxml/include/libxml/encoding.h
@@ -26,6 +26,24 @@
#ifdef LIBXML_ICONV_ENABLED
#include <iconv.h>
+#else
+#ifdef LIBXML_ICU_ENABLED
+#include <unicode/ucnv.h>
+#if 0
+/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h>
+ * to prevent unwanted ICU symbols being exposed to users of libxml2.
+ * One particular case is Qt4 conflicting on UChar32.
+ */
+#include <stdint.h>
+struct UConverter;
+typedef struct UConverter UConverter;
+#ifdef _MSC_VER
+typedef wchar_t UChar;
+#else
+typedef uint16_t UChar;
+#endif
+#endif
+#endif
#endif
#ifdef __cplusplus
extern "C" {
@@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
* Block defining the handlers for non UTF-8 encodings.
* If iconv is supported, there are two extra fields.
*/
+#ifdef LIBXML_ICU_ENABLED
+struct _uconv_t { /* pair of ICU converters; conversions go via UTF-16 */
+ UConverter *uconv; /* for conversion between an encoding and UTF-16 */
+ UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
+};
+typedef struct _uconv_t uconv_t;
+#endif
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
@@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler {
iconv_t iconv_in;
iconv_t iconv_out;
#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+ uconv_t *uconv_in;
+ uconv_t *uconv_out;
+#endif /* LIBXML_ICU_ENABLED */
};
#ifdef __cplusplus
diff --git a/third_party/libxml/include/libxml/parser.h b/third_party/libxml/include/libxml/parser.h
index dd79c42..3580b63 100644
--- a/third_party/libxml/include/libxml/parser.h
+++ b/third_party/libxml/include/libxml/parser.h
@@ -1222,6 +1222,7 @@ typedef enum {
XML_WITH_DEBUG_MEM = 29,
XML_WITH_DEBUG_RUN = 30,
XML_WITH_ZLIB = 31,
+ XML_WITH_ICU = 32,
XML_WITH_NONE = 99999 /* just to be sure of allocation size */
} xmlFeature;
diff --git a/third_party/libxml/include/libxml/xmlversion.h.in b/third_party/libxml/include/libxml/xmlversion.h.in
index 4739f3a..de310ab 100644
--- a/third_party/libxml/include/libxml/xmlversion.h.in
+++ b/third_party/libxml/include/libxml/xmlversion.h.in
@@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
#endif
/**
+ * LIBXML_ICU_ENABLED:
+ *
+ * Whether icu support is available
+ */
+#if @WITH_ICU@
+#define LIBXML_ICU_ENABLED
+#endif
+
+/**
* LIBXML_ISO8859X_ENABLED:
*
* Whether ISO-8859-* support is made available in case iconv is not
diff --git a/third_party/libxml/parser.c b/third_party/libxml/parser.c
index 85e7599..3ba2a06 100644
--- a/third_party/libxml/parser.c
+++ b/third_party/libxml/parser.c
@@ -954,6 +954,12 @@ xmlHasFeature(xmlFeature feature)
#else
return(0);
#endif
+ case XML_WITH_ICU:
+#ifdef LIBXML_ICU_ENABLED
+ return(1);
+#else
+ return(0);
+#endif
default:
break;
}