From 4ad04ee227dd7d704f417aaf9d6762f5cfdf4c1f Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Tue, 29 Oct 2013 12:15:26 +0400 Subject: [PATCH] report all libxml errors in updater debug output force utf8 encoding if devforceupdate is on parser: try to convert non-unicode feeds with specified encoding to utf8 before trying to remove dangling utf8 characters in case of utf8-related libxml errors because doing so produces garbage content --- classes/feedparser.php | 29 +++++++++++++++++++---------- classes/feeds.php | 2 +- include/rssfuncs.php | 8 +++++++- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/classes/feedparser.php b/classes/feedparser.php index 1c97e496b..de6c56542 100644 --- a/classes/feedparser.php +++ b/classes/feedparser.php @@ -13,6 +13,16 @@ class FeedParser { const FEED_RSS = 1; const FEED_ATOM = 2; + function normalize_encoding($data) { + if (preg_match('/^(<\?xml[\t\n\r ].*?encoding[\t\n\r ]*=[\t\n\r ]*["\'])(.+?)(["\'].*?\?>)/s', $data, $matches) === 1) { + $data = mb_convert_encoding($data, 'UTF-8', $matches[2]); + + $data = preg_replace('/^<\?xml[\t\n\r ].*?\?>/s', $matches[1] . "UTF-8" . $matches[3] , $data); + } + + return $data; + } + function __construct($data) { libxml_use_internal_errors(true); libxml_clear_errors(); @@ -25,19 +35,15 @@ class FeedParser { // libxml compiled without iconv? if ($error && $error->code == 32) { - if (preg_match('/^(<\?xml[\t\n\r ].*?encoding[\t\n\r ]*=[\t\n\r ]*["\'])(.+?)(["\'].*?\?>)/s', $data, $matches) === 1) { - $data = mb_convert_encoding($data, 'UTF-8', $matches[2]); + $data = $this->normalize_encoding($data); - $data = preg_replace('/^<\?xml[\t\n\r ].*?\?>/s', $matches[1] . "UTF-8" . $matches[3] , $data); + if ($data) { + libxml_clear_errors(); - if ($data) { - libxml_clear_errors(); + $this->doc = new DOMDocument(); + $this->doc->loadXML($data); - $this->doc = new DOMDocument(); - $this->doc->loadXML($data); - - $error = libxml_get_last_error(); - } + $error = libxml_get_last_error(); } } @@ -45,6 +51,9 @@ class FeedParser { if ($error) { foreach (libxml_get_errors() as $err) { if ($err->code == 9) { + // if the source feed is not in utf8, next conversion will fail + $data = $this->normalize_encoding($data); + // remove dangling bytes $data = mb_convert_encoding($data, 'UTF-8', 'UTF-8'); diff --git a/classes/feeds.php b/classes/feeds.php index 7f5fd10af..2c17a2257 100644 --- a/classes/feeds.php +++ b/classes/feeds.php @@ -148,7 +148,7 @@ class Feeds extends Handler_Protected { $override_order = false, $include_children = false) { if (isset($_REQUEST["DevForceUpdate"])) - header("Content-Type: text/plain"); + header("Content-Type: text/plain; charset=utf-8"); $disable_cache = false; diff --git a/include/rssfuncs.php b/include/rssfuncs.php index bfbec0919..bc6048217 100644 --- a/include/rssfuncs.php +++ b/include/rssfuncs.php @@ -1110,7 +1110,13 @@ $error_msg = db_escape_string(mb_substr($rss->error(), 0, 245)); - _debug("error fetching feed: $error_msg", $debug_enabled); + _debug("fetch error: $error_msg", $debug_enabled); + + if (count($rss->errors()) > 1) { + foreach ($rss->errors() as $error) { + _debug("+ $error"); + } + } db_query( "UPDATE ttrss_feeds SET last_error = '$error_msg',