UrlHelper::rewrite_relative():

 - support being invoked with the owning element/attribute of the URL specified
 - restrict mailto/magnet/tel schemes for A href
 - allow some data: base64 image types for IMG src

Sanitizer::sanitize():

 - when checking href and src attributes, pass element tagname and attribute to rewrite_relative()
This commit is contained in:
Andrew Dolgov 2021-06-18 11:20:57 +03:00
parent 34807bacd4
commit e9c062a189
2 changed files with 20 additions and 7 deletions

View File

@ -74,7 +74,7 @@ class Sanitizer {
if ($entry->hasAttribute('href')) { if ($entry->hasAttribute('href')) {
$entry->setAttribute('href', $entry->setAttribute('href',
rewrite_relative_url($rewrite_base_url, $entry->getAttribute('href'))); UrlHelper::rewrite_relative($rewrite_base_url, $entry->getAttribute('href'), $entry->tagName, "href"));
$entry->setAttribute('rel', 'noopener noreferrer'); $entry->setAttribute('rel', 'noopener noreferrer');
$entry->setAttribute("target", "_blank"); $entry->setAttribute("target", "_blank");
@ -82,7 +82,7 @@ class Sanitizer {
if ($entry->hasAttribute('src')) { if ($entry->hasAttribute('src')) {
$entry->setAttribute('src', $entry->setAttribute('src',
rewrite_relative_url($rewrite_base_url, $entry->getAttribute('src'))); UrlHelper::rewrite_relative($rewrite_base_url, $entry->getAttribute('src'), $entry->tagName, "src"));
} }
if ($entry->nodeName == 'img') { if ($entry->nodeName == 'img') {
@ -94,7 +94,7 @@ class Sanitizer {
$matches = RSSUtils::decode_srcset($entry->getAttribute('srcset')); $matches = RSSUtils::decode_srcset($entry->getAttribute('srcset'));
for ($i = 0; $i < count($matches); $i++) { for ($i = 0; $i < count($matches); $i++) {
$matches[$i]["url"] = rewrite_relative_url($rewrite_base_url, $matches[$i]["url"]); $matches[$i]["url"] = UrlHelper::rewrite_relative($rewrite_base_url, $matches[$i]["url"]);
} }
$entry->setAttribute("srcset", RSSUtils::encode_srcset($matches)); $entry->setAttribute("srcset", RSSUtils::encode_srcset($matches));

View File

@ -1,6 +1,6 @@
<?php <?php
class UrlHelper { class UrlHelper {
const ALLOWED_RELATIVE_SCHEMES = [ const EXTRA_HREF_SCHEMES = [
"magnet", "magnet",
"mailto", "mailto",
"tel" "tel"
@ -27,22 +27,35 @@ class UrlHelper {
/** /**
* Converts a (possibly) relative URL to an absolute one, using provided base URL. * Converts a (possibly) relative URL to an absolute one, using provided base URL.
* Provides some exceptions for additional schemes like data: if called with owning element/attribute.
* *
* @param string $base_url Base URL (i.e. from where the document is) * @param string $base_url Base URL (i.e. from where the document is)
* @param string $rel_url Possibly relative URL in the document * @param string $rel_url Possibly relative URL in the document
* @param string $owner_element Owner node tag name (i.e. A) (optional)
* @param string $owner_attribute Owner attribute (i.e. href) (optional)
* *
* @return string Absolute URL * @return string Absolute URL
*/ */
public static function rewrite_relative($base_url, $rel_url) { public static function rewrite_relative($base_url, $rel_url, string $owner_element = "", string $owner_attribute = "") {
$rel_parts = parse_url($rel_url); $rel_parts = parse_url($rel_url);
if (!empty($rel_parts['host']) && !empty($rel_parts['scheme'])) { if (!empty($rel_parts['host']) && !empty($rel_parts['scheme'])) {
return self::validate($rel_url); return self::validate($rel_url);
// protocol-relative URL (rare but they exist)
} else if (strpos($rel_url, "//") === 0) { } else if (strpos($rel_url, "//") === 0) {
# protocol-relative URL (rare but they exist)
return self::validate("https:" . $rel_url); return self::validate("https:" . $rel_url);
} else if (array_search($rel_parts["scheme"] ?? "", self::ALLOWED_RELATIVE_SCHEMES, true) !== false) { // allow some extra schemes for A href
} else if (in_array($rel_parts["scheme"] ?? "", self::EXTRA_HREF_SCHEMES) &&
$owner_element == "a" &&
$owner_attribute == "href") {
return $rel_url;
// allow limited subset of inline base64-encoded images for IMG elements
} else if ($rel_parts["scheme"] == "data" &&
preg_match('%^image/(webp|gif|jpg|png|svg);base64,%', $rel_parts["path"]) &&
$owner_element == "img" &&
$owner_attribute == "src") {
return $rel_url; return $rel_url;
} else { } else {
$base_parts = parse_url($base_url); $base_parts = parse_url($base_url);