From d4da4dcc321ca65fb2cd19877f395cc5f75933ab Mon Sep 17 00:00:00 2001 From: Chih-Hsuan Yen <645432-yan12125@users.noreply.gitlab.com> Date: Sun, 26 Nov 2023 20:53:05 +0800 Subject: [PATCH] Fix sanitizer with libxml2 >= 2.12.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Somehow with newer libxml2, `<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>` no longer enforces UTF-8. Instead, non-ASCII contents are treated as ISO-8859-1 and get broken. For example, `<p>中文</p>` becomes `<p>ä¸­æ–‡</p>` (should be `<p>中文</p>`). Switching to another trick mentioned on [1] fixes the issue, and the new trick still works with older libxml2 (tested 2.11.5). As a side note, DOMDocument::loadHTML uses HTMLParser in libxml2 [2][3]. [1] https://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly [2] https://github.com/php/php-src/blob/php-8.1.26/ext/dom/document.c#L1855 [3] https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-HTMLparser.html --- classes/Sanitizer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classes/Sanitizer.php b/classes/Sanitizer.php index a7bea9e5f..7af92f249 100644 --- a/classes/Sanitizer.php +++ b/classes/Sanitizer.php @@ -72,7 +72,7 @@ class Sanitizer { $res = trim($str); if (!$res) return ''; $doc = new DOMDocument(); - $doc->loadHTML('' . $res); + $doc->loadHTML('' . $res); $xpath = new DOMXPath($doc); // is it a good idea to possibly rewrite urls to our own prefix?