domdocument: remove the old meta-charset Unicode hacks and replace them with a shorter XML-preamble UTF-8 hack (on loadHTML() calls where it makes sense)
af_readability: better (?) charset hack for non-Unicode pages
This commit is contained in:
parent
3bd3324e5a
commit
671f4cee65
|
@ -329,7 +329,7 @@ class Handler_Public extends Handler {
|
||||||
if (!$og_image) {
|
if (!$og_image) {
|
||||||
$tmpdoc = new DOMDocument();
|
$tmpdoc = new DOMDocument();
|
||||||
|
|
||||||
if (@$tmpdoc->loadHTML(mb_substr($content, 0, 131070))) {
|
if (@$tmpdoc->loadHTML('<?xml encoding="UTF-8">' . mb_substr($content, 0, 131070))) {
|
||||||
$tmpxpath = new DOMXPath($tmpdoc);
|
$tmpxpath = new DOMXPath($tmpdoc);
|
||||||
$imgs = $tmpxpath->query("//img");
|
$imgs = $tmpxpath->query("//img");
|
||||||
|
|
||||||
|
|
|
@ -1200,12 +1200,8 @@ class RSSUtils {
|
||||||
static function cache_media($html, $site_url) {
|
static function cache_media($html, $site_url) {
|
||||||
libxml_use_internal_errors(true);
|
libxml_use_internal_errors(true);
|
||||||
|
|
||||||
$charset_hack = '<head>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
|
||||||
</head>';
|
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
$doc = new DOMDocument();
|
||||||
$doc->loadHTML($charset_hack . $html);
|
$doc->loadHTML('<?xml encoding="UTF-8">' . $html);
|
||||||
$xpath = new DOMXPath($doc);
|
$xpath = new DOMXPath($doc);
|
||||||
|
|
||||||
$entries = $xpath->query('(//img[@src])|(//video/source[@src])|(//audio/source[@src])');
|
$entries = $xpath->query('(//img[@src])|(//video/source[@src])|(//audio/source[@src])');
|
||||||
|
|
|
@ -562,7 +562,7 @@
|
||||||
libxml_use_internal_errors(true);
|
libxml_use_internal_errors(true);
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
$doc = new DOMDocument();
|
||||||
$doc->loadHTML($html);
|
$doc->loadHTML('<?xml encoding="UTF-8">' . $html);
|
||||||
$xpath = new DOMXPath($doc);
|
$xpath = new DOMXPath($doc);
|
||||||
|
|
||||||
$base = $xpath->query('/html/head/base[@href]');
|
$base = $xpath->query('/html/head/base[@href]');
|
||||||
|
@ -1518,14 +1518,10 @@
|
||||||
// plugins work on original source URLs used before caching
|
// plugins work on original source URLs used before caching
|
||||||
|
|
||||||
function rewrite_cached_urls($str) {
|
function rewrite_cached_urls($str) {
|
||||||
$charset_hack = '<head>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
|
||||||
</head>';
|
|
||||||
|
|
||||||
$res = trim($str); if (!$res) return '';
|
$res = trim($str); if (!$res) return '';
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
$doc = new DOMDocument();
|
||||||
$doc->loadHTML($charset_hack . $res);
|
$doc->loadHTML('<?xml encoding="UTF-8">' . $res);
|
||||||
$xpath = new DOMXPath($doc);
|
$xpath = new DOMXPath($doc);
|
||||||
|
|
||||||
$entries = $xpath->query('(//img[@src]|//picture/source[@src]|//video[@poster]|//video/source[@src]|//audio/source[@src])');
|
$entries = $xpath->query('(//img[@src]|//picture/source[@src]|//video[@poster]|//video/source[@src]|//audio/source[@src])');
|
||||||
|
@ -1580,16 +1576,10 @@
|
||||||
|
|
||||||
$res = trim($str); if (!$res) return '';
|
$res = trim($str); if (!$res) return '';
|
||||||
|
|
||||||
$charset_hack = '<head>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
|
||||||
</head>';
|
|
||||||
|
|
||||||
$res = trim($res); if (!$res) return '';
|
|
||||||
|
|
||||||
libxml_use_internal_errors(true);
|
libxml_use_internal_errors(true);
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
$doc = new DOMDocument();
|
||||||
$doc->loadHTML($charset_hack . $res);
|
$doc->loadHTML('<?xml encoding="UTF-8">' . $res);
|
||||||
$xpath = new DOMXPath($doc);
|
$xpath = new DOMXPath($doc);
|
||||||
|
|
||||||
$rewrite_base_url = $site_url ? $site_url : get_self_url_prefix();
|
$rewrite_base_url = $site_url ? $site_url : get_self_url_prefix();
|
||||||
|
@ -2115,7 +2105,7 @@
|
||||||
libxml_use_internal_errors(true);
|
libxml_use_internal_errors(true);
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
$doc = new DOMDocument();
|
||||||
$doc->loadHTML($content);
|
$doc->loadHTML('<?xml encoding="UTF-8">' . $content);
|
||||||
$xpath = new DOMXPath($doc);
|
$xpath = new DOMXPath($doc);
|
||||||
$entries = $xpath->query('/html/head/link[@rel="alternate" and '.
|
$entries = $xpath->query('/html/head/link[@rel="alternate" and '.
|
||||||
'(contains(@type,"rss") or contains(@type,"atom"))]|/html/head/link[@rel="feed"]');
|
'(contains(@type,"rss") or contains(@type,"atom"))]|/html/head/link[@rel="feed"]');
|
||||||
|
@ -2136,7 +2126,7 @@
|
||||||
}
|
}
|
||||||
|
|
||||||
function is_html($content) {
|
function is_html($content) {
|
||||||
return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 100)) !== 0;
|
return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 8192)) !== 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
function url_is_html($url, $login = false, $pass = false) {
|
function url_is_html($url, $login = false, $pass = false) {
|
||||||
|
|
|
@ -19,11 +19,7 @@ class Af_Fsckportal extends Plugin {
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
$doc = new DOMDocument();
|
||||||
|
|
||||||
$charset_hack = '<head>
|
@$doc->loadHTML('<?xml encoding="UTF-8">' . $article["content"]);
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
|
||||||
</head>';
|
|
||||||
|
|
||||||
@$doc->loadHTML($charset_hack . $article["content"]);
|
|
||||||
|
|
||||||
if ($doc) {
|
if ($doc) {
|
||||||
$xpath = new DOMXPath($doc);
|
$xpath = new DOMXPath($doc);
|
||||||
|
|
|
@ -172,14 +172,10 @@ class Af_Readability extends Plugin {
|
||||||
if (!$tmpdoc->loadHTML($tmp))
|
if (!$tmpdoc->loadHTML($tmp))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
// this is the worst hack yet :(
|
||||||
if (strtolower($tmpdoc->encoding) != 'utf-8') {
|
if (strtolower($tmpdoc->encoding) != 'utf-8') {
|
||||||
$tmpxpath = new DOMXPath($tmpdoc);
|
$tmp = preg_replace("/<meta.*?charset.*?\/>/i", "", $tmp);
|
||||||
|
$tmp = mb_convert_encoding($tmp, 'utf-8', $tmpdoc->encoding);
|
||||||
foreach ($tmpxpath->query("//meta") as $elem) {
|
|
||||||
$elem->parentNode->removeChild($elem);
|
|
||||||
}
|
|
||||||
|
|
||||||
$tmp = $tmpdoc->saveHTML();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
@ -210,7 +206,6 @@ class Af_Readability extends Plugin {
|
||||||
} catch (Exception $e) {
|
} catch (Exception $e) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -25,12 +25,8 @@ class Af_Tumblr_1280 extends Plugin {
|
||||||
if (!function_exists("curl_init") || ini_get("open_basedir"))
|
if (!function_exists("curl_init") || ini_get("open_basedir"))
|
||||||
return $article;
|
return $article;
|
||||||
|
|
||||||
$charset_hack = '<head>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
|
||||||
</head>';
|
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
$doc = new DOMDocument();
|
||||||
$doc->loadHTML($charset_hack . $article["content"]);
|
$doc->loadHTML('<?xml encoding="UTF-8">' . $article["content"]);
|
||||||
|
|
||||||
$found = false;
|
$found = false;
|
||||||
|
|
||||||
|
@ -92,4 +88,4 @@ class Af_Tumblr_1280 extends Plugin {
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -155,7 +155,7 @@ class Af_Zz_ImgProxy extends Plugin {
|
||||||
$proxy_all = $this->host->get($this, "proxy_all");
|
$proxy_all = $this->host->get($this, "proxy_all");
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
$doc = new DOMDocument();
|
||||||
if (@$doc->loadHTML($article["content"])) {
|
if (@$doc->loadHTML('<?xml encoding="UTF-8">' . $article["content"])) {
|
||||||
$xpath = new DOMXPath($doc);
|
$xpath = new DOMXPath($doc);
|
||||||
$imgs = $xpath->query("//img[@src]");
|
$imgs = $xpath->query("//img[@src]");
|
||||||
|
|
||||||
|
|
|
@ -190,12 +190,8 @@ class Cache_Starred_Images extends Plugin implements IHandler {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
$charset_hack = '<head>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
|
||||||
</head>';
|
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
$doc = new DOMDocument();
|
||||||
$doc->loadHTML($charset_hack . $content);
|
$doc->loadHTML('<?xml encoding="UTF-8">' . $content);
|
||||||
$xpath = new DOMXPath($doc);
|
$xpath = new DOMXPath($doc);
|
||||||
|
|
||||||
$entries = $xpath->query('(//img[@src])|(//video/source[@src])');
|
$entries = $xpath->query('(//img[@src])|(//video/source[@src])');
|
||||||
|
|
Loading…
Reference in New Issue