diff --git a/trunk/PKGBUILD b/trunk/PKGBUILD index ebe24f8..4210a34 100644 --- a/trunk/PKGBUILD +++ b/trunk/PKGBUILD @@ -15,9 +15,15 @@ makedepends=(python2 python git) _commit=b48e77cf4f6fa0792c5f4b639707a2b0675e461b # tags/v2.9.12^0 source=("git+https://gitlab.gnome.org/GNOME/libxml2.git#commit=$_commit" libxml2-2.9.8-python3-unicode-errors.patch + libxml2-2.9.12-fix-lxml-corrupted-tree.patch + libxml2-2.9.12-fix-lxml-compatibility.patch + libxml2-2.9.12-fix-whitespace-when-serializing-empty-html.patch https://www.w3.org/XML/Test/xmlts20130923.tar.gz) sha256sums=('SKIP' '37eb81a8ec6929eed1514e891bff2dd05b450bcf0c712153880c485b7366c17c' + '5b64aa7f2411eb7650491bf42e4a3531fc3158257b4745956975b9b9bc605f7d' + 'a0bb996bc73a13f3c3758f74d0a4198757e703710c5e3e462fd7a8e58a561563' + '74aa35b58890355411af5fb7dbb9deaedd6dff6d0f4cb35dd1196b91dd68a410' '9b61db9f5dbffa545f4b8d78422167083a8568c59bd1129f94138f936cf6fc1f') pkgver() { diff --git a/trunk/libxml2-2.9.12-fix-formatting-regression.patch b/trunk/libxml2-2.9.12-fix-formatting-regression.patch new file mode 100644 index 0000000..ffbc850 --- /dev/null +++ b/trunk/libxml2-2.9.12-fix-formatting-regression.patch @@ -0,0 +1,46 @@ +From 13ad8736d294536da4cbcd70a96b0a2fbf47070c Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 25 May 2021 10:55:25 +0200 +Subject: [PATCH] Fix regression in xmlNodeDumpOutputInternal + +Commit 85b1792e could cause additional whitespace if xmlNodeDump was +called with a non-zero starting level. +--- + xmlsave.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/xmlsave.c b/xmlsave.c +index aedbd5e7..489505f4 100644 +--- a/xmlsave.c ++++ b/xmlsave.c +@@ -890,6 +890,13 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + + case XML_ELEMENT_NODE: ++ if ((cur != root) && (ctxt->format == 1) && ++ (xmlIndentTreeOutput)) ++ xmlOutputBufferWrite(buf, ctxt->indent_size * ++ (ctxt->level > ctxt->indent_nr ? ++ ctxt->indent_nr : ctxt->level), ++ ctxt->indent); ++ + /* + * Some users like lxml are known to pass nodes with a corrupted + * tree structure. Fall back to a recursive call to handle this +@@ -900,13 +907,6 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + } + +- if ((ctxt->level > 0) && (ctxt->format == 1) && +- (xmlIndentTreeOutput)) +- xmlOutputBufferWrite(buf, ctxt->indent_size * +- (ctxt->level > ctxt->indent_nr ? +- ctxt->indent_nr : ctxt->level), +- ctxt->indent); +- + xmlOutputBufferWrite(buf, 1, "<"); + if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { + xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); +-- +GitLab + diff --git a/trunk/libxml2-2.9.12-fix-lxml-compatibility.patch b/trunk/libxml2-2.9.12-fix-lxml-compatibility.patch new file mode 100644 index 0000000..ffbc850 --- /dev/null +++ b/trunk/libxml2-2.9.12-fix-lxml-compatibility.patch @@ -0,0 +1,46 @@ +From 13ad8736d294536da4cbcd70a96b0a2fbf47070c Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 25 May 2021 10:55:25 +0200 +Subject: [PATCH] Fix regression in xmlNodeDumpOutputInternal + +Commit 85b1792e could cause additional whitespace if xmlNodeDump was +called with a non-zero starting level. +--- + xmlsave.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/xmlsave.c b/xmlsave.c +index aedbd5e7..489505f4 100644 +--- a/xmlsave.c ++++ b/xmlsave.c +@@ -890,6 +890,13 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + + case XML_ELEMENT_NODE: ++ if ((cur != root) && (ctxt->format == 1) && ++ (xmlIndentTreeOutput)) ++ xmlOutputBufferWrite(buf, ctxt->indent_size * ++ (ctxt->level > ctxt->indent_nr ? ++ ctxt->indent_nr : ctxt->level), ++ ctxt->indent); ++ + /* + * Some users like lxml are known to pass nodes with a corrupted + * tree structure. Fall back to a recursive call to handle this +@@ -900,13 +907,6 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + } + +- if ((ctxt->level > 0) && (ctxt->format == 1) && +- (xmlIndentTreeOutput)) +- xmlOutputBufferWrite(buf, ctxt->indent_size * +- (ctxt->level > ctxt->indent_nr ? +- ctxt->indent_nr : ctxt->level), +- ctxt->indent); +- + xmlOutputBufferWrite(buf, 1, "<"); + if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { + xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); +-- +GitLab + diff --git a/trunk/libxml2-2.9.12-fix-lxml-corrupted-tree.patch b/trunk/libxml2-2.9.12-fix-lxml-corrupted-tree.patch new file mode 100644 index 0000000..482b9f0 --- /dev/null +++ b/trunk/libxml2-2.9.12-fix-lxml-corrupted-tree.patch @@ -0,0 +1,211 @@ +From 85b1792e37b131e7a51af98a37f92472e8de5f3f Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 18 May 2021 20:08:28 +0200 +Subject: [PATCH] Work around lxml API abuse + +Make xmlNodeDumpOutput and htmlNodeDumpFormatOutput work with corrupted +parent pointers. This used to work with the old recursive code but the +non-recursive rewrite required parent pointers to be set correctly. + +Unfortunately, lxml relies on the old behavior and passes subtrees with +a corrupted structure. Fall back to a recursive function call if an +invalid parent pointer is detected. + +Fixes #255. +--- + HTMLtree.c | 46 ++++++++++++++++++++++++++++------------------ + xmlsave.c | 31 +++++++++++++++++++++---------- + 2 files changed, 49 insertions(+), 28 deletions(-) + +diff --git a/HTMLtree.c b/HTMLtree.c +index 24434d45..bdd639c7 100644 +--- a/HTMLtree.c ++++ b/HTMLtree.c +@@ -744,7 +744,7 @@ void + htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, + int format) { +- xmlNodePtr root; ++ xmlNodePtr root, parent; + xmlAttrPtr attr; + const htmlElemDesc * info; + +@@ -755,6 +755,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + } + + root = cur; ++ parent = cur->parent; + while (1) { + switch (cur->type) { + case XML_HTML_DOCUMENT_NODE: +@@ -762,13 +763,25 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + if (((xmlDocPtr) cur)->intSubset != NULL) { + htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); + } +- if (cur->children != NULL) { ++ /* Always validate cur->parent when descending. */ ++ if ((cur->parent == parent) && (cur->children != NULL)) { ++ parent = cur; + cur = cur->children; + continue; + } + break; + + case XML_ELEMENT_NODE: ++ /* ++ * Some users like lxml are known to pass nodes with a corrupted ++ * tree structure. Fall back to a recursive call to handle this ++ * case. ++ */ ++ if ((cur->parent != parent) && (cur->children != NULL)) { ++ htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); ++ break; ++ } ++ + /* + * Get specific HTML info for that node. + */ +@@ -817,6 +830,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + (cur->name != NULL) && + (cur->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); ++ parent = cur; + cur = cur->children; + continue; + } +@@ -825,9 +839,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + (info != NULL) && (!info->isinline)) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE) && +- (cur->parent != NULL) && +- (cur->parent->name != NULL) && +- (cur->parent->name[0] != 'p')) /* p, pre, param */ ++ (parent != NULL) && ++ (parent->name != NULL) && ++ (parent->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + } + +@@ -842,9 +856,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + break; + if (((cur->name == (const xmlChar *)xmlStringText) || + (cur->name != (const xmlChar *)xmlStringTextNoenc)) && +- ((cur->parent == NULL) || +- ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && +- (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { ++ ((parent == NULL) || ++ ((xmlStrcasecmp(parent->name, BAD_CAST "script")) && ++ (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) { + xmlChar *buffer; + + buffer = xmlEncodeEntitiesReentrant(doc, cur->content); +@@ -902,13 +916,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + break; + } + +- /* +- * The parent should never be NULL here but we want to handle +- * corrupted documents gracefully. +- */ +- if (cur->parent == NULL) +- return; +- cur = cur->parent; ++ cur = parent; ++ /* cur->parent was validated when descending. */ ++ parent = cur->parent; + + if ((cur->type == XML_HTML_DOCUMENT_NODE) || + (cur->type == XML_DOCUMENT_NODE)) { +@@ -939,9 +949,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + (cur->next != NULL)) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE) && +- (cur->parent != NULL) && +- (cur->parent->name != NULL) && +- (cur->parent->name[0] != 'p')) /* p, pre, param */ ++ (parent != NULL) && ++ (parent->name != NULL) && ++ (parent->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + } + } +diff --git a/xmlsave.c b/xmlsave.c +index 61a40459..aedbd5e7 100644 +--- a/xmlsave.c ++++ b/xmlsave.c +@@ -847,7 +847,7 @@ htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + static void + xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + int format = ctxt->format; +- xmlNodePtr tmp, root, unformattedNode = NULL; ++ xmlNodePtr tmp, root, unformattedNode = NULL, parent; + xmlAttrPtr attr; + xmlChar *start, *end; + xmlOutputBufferPtr buf; +@@ -856,6 +856,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + buf = ctxt->buf; + + root = cur; ++ parent = cur->parent; + while (1) { + switch (cur->type) { + case XML_DOCUMENT_NODE: +@@ -868,7 +869,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + + case XML_DOCUMENT_FRAG_NODE: +- if (cur->children != NULL) { ++ /* Always validate cur->parent when descending. */ ++ if ((cur->parent == parent) && (cur->children != NULL)) { ++ parent = cur; + cur = cur->children; + continue; + } +@@ -887,7 +890,18 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + + case XML_ELEMENT_NODE: +- if ((cur != root) && (ctxt->format == 1) && (xmlIndentTreeOutput)) ++ /* ++ * Some users like lxml are known to pass nodes with a corrupted ++ * tree structure. Fall back to a recursive call to handle this ++ * case. ++ */ ++ if ((cur->parent != parent) && (cur->children != NULL)) { ++ xmlNodeDumpOutputInternal(ctxt, cur); ++ break; ++ } ++ ++ if ((ctxt->level > 0) && (ctxt->format == 1) && ++ (xmlIndentTreeOutput)) + xmlOutputBufferWrite(buf, ctxt->indent_size * + (ctxt->level > ctxt->indent_nr ? + ctxt->indent_nr : ctxt->level), +@@ -942,6 +956,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + xmlOutputBufferWrite(buf, 1, ">"); + if (ctxt->format == 1) xmlOutputBufferWrite(buf, 1, "\n"); + if (ctxt->level >= 0) ctxt->level++; ++ parent = cur; + cur = cur->children; + continue; + } +@@ -1058,13 +1073,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + } + +- /* +- * The parent should never be NULL here but we want to handle +- * corrupted documents gracefully. +- */ +- if (cur->parent == NULL) +- return; +- cur = cur->parent; ++ cur = parent; ++ /* cur->parent was validated when descending. */ ++ parent = cur->parent; + + if (cur->type == XML_ELEMENT_NODE) { + if (ctxt->level > 0) ctxt->level--; +-- +GitLab + diff --git a/trunk/libxml2-2.9.12-fix-whitespace-when-serializing-empty-html.patch b/trunk/libxml2-2.9.12-fix-whitespace-when-serializing-empty-html.patch new file mode 100644 index 0000000..81fc243 --- /dev/null +++ b/trunk/libxml2-2.9.12-fix-whitespace-when-serializing-empty-html.patch @@ -0,0 +1,43 @@ +From 92d9ab4c28842a09ca2b76d3ff2f933e01b6cd6f Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Mon, 7 Jun 2021 15:09:53 +0200 +Subject: [PATCH] Fix whitespace when serializing empty HTML documents + +The old, non-recursive HTML serialization code would always terminate +the output with a newline. The new implementation omitted the newline +if the document node had no children. Readd the newline when +serializing empty documents. + +Fixes #266. +--- + HTMLtree.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/HTMLtree.c b/HTMLtree.c +index bdd639c7..7a2b8558 100644 +--- a/HTMLtree.c ++++ b/HTMLtree.c +@@ -763,11 +763,15 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + if (((xmlDocPtr) cur)->intSubset != NULL) { + htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); + } +- /* Always validate cur->parent when descending. */ +- if ((cur->parent == parent) && (cur->children != NULL)) { +- parent = cur; +- cur = cur->children; +- continue; ++ if (cur->children != NULL) { ++ /* Always validate cur->parent when descending. */ ++ if (cur->parent == parent) { ++ parent = cur; ++ cur = cur->children; ++ continue; ++ } ++ } else { ++ xmlOutputBufferWriteString(buf, "\n"); + } + break; + +-- +GitLab +