feeditem_atom: support xml:base for enclosures and entry content

UrlHelper::rewrite_relative: use base URL path if relative url path is not absolute (experimental)
This commit is contained in:
Andrew Dolgov 2021-05-21 15:39:41 +03:00
parent d09a64d6f9
commit dff479af64
3 changed files with 60 additions and 16 deletions

View File

@ -60,43 +60,76 @@ class FeedItem_Atom extends FeedItem_Common {
}
}
/**
 * Rewrites href/src attribute values found in an HTML fragment against a base URL.
 *
 * @param string|null $base    base URL (callers pass the nearest xml:base value); optional
 * @param string      $content HTML string
 *
 * @return string|false $content unchanged when $base or $content is empty or the HTML could not be loaded;
 *                      otherwise whatever DOMDocument::saveXML() produces for the rewritten document
 */
private function rewrite_content_to_base($base, $content) {
	if (empty($base) || empty($content)) {
		return $content;
	}

	$tmpdoc = new DOMDocument();

	// warnings raised by loadHTML() are suppressed on purpose; a load failure falls back to the original content
	// NOTE(review): the leading XML declaration is presumably there to make libxml treat the input as UTF-8
	if (!@$tmpdoc->loadHTML('<?xml encoding="UTF-8">' . $content)) {
		return $content;
	}

	$tmpxpath = new DOMXPath($tmpdoc);
	$elems = $tmpxpath->query("(//*[@href]|//*[@src])");

	foreach ($elems as $elem) {
		// an element may carry both attributes, so each one is checked independently
		// (an if/else-if chain would leave src untouched whenever href is present)
		foreach (["href", "src"] as $attr) {
			if ($elem->hasAttribute($attr)) {
				$elem->setAttribute($attr,
					UrlHelper::rewrite_relative($base, $elem->getAttribute($attr)));
			}
		}
	}

	return $tmpdoc->saveXML();
}
/**
 * Returns the HTML of the first <content> element of this entry.
 *
 * The nearest xml:base value (on the element itself or on an ancestor) is looked up so that
 * the result can be passed through rewrite_content_to_base(). Nothing is returned when the
 * entry has no <content> element.
 */
function get_content() {
$content = $this->elem->getElementsByTagName("content")->item(0);
if ($content) {
// empty string when neither the element nor any ancestor declares xml:base
$base = $this->xpath->evaluate("string(ancestor-or-self::*[@xml:base][1]/@xml:base)", $content);
if ($content->hasAttribute('type')) {
if ($content->getAttribute('type') == 'xhtml') {
// xhtml content: serialize the first child node that itself has children
for ($i = 0; $i < $content->childNodes->length; $i++) {
$child = $content->childNodes->item($i);
if ($child->hasChildNodes()) {
// NOTE(review): the two consecutive returns below look like the pre- and post-change
// versions of the same line; as written the first one wins and the base-rewriting
// return is unreachable - confirm which one should remain
return $this->doc->saveHTML($child);
return $this->rewrite_content_to_base($base, $this->doc->saveHTML($child));
}
}
}
}
// NOTE(review): same duplicated-return pattern as above - confirm which one should remain
return $this->subtree_or_text($content);
return $this->rewrite_content_to_base($base, $this->subtree_or_text($content));
}
}
// TODO: duplicate code should be merged with get_content()
/**
 * Returns the HTML of the first <summary> element of this entry.
 *
 * Follows the same steps as get_content(), only reading <summary> instead of <content>:
 * the nearest xml:base value is looked up so that the result can be passed through
 * rewrite_content_to_base(). Nothing is returned when the entry has no <summary> element.
 */
function get_description() {
$content = $this->elem->getElementsByTagName("summary")->item(0);
if ($content) {
// empty string when neither the element nor any ancestor declares xml:base
$base = $this->xpath->evaluate("string(ancestor-or-self::*[@xml:base][1]/@xml:base)", $content);
if ($content->hasAttribute('type')) {
if ($content->getAttribute('type') == 'xhtml') {
// xhtml summary: serialize the first child node that itself has children
for ($i = 0; $i < $content->childNodes->length; $i++) {
$child = $content->childNodes->item($i);
if ($child->hasChildNodes()) {
// NOTE(review): the two consecutive returns below look like the pre- and post-change
// versions of the same line; as written the first one wins and the base-rewriting
// return is unreachable - confirm which one should remain
return $this->doc->saveHTML($child);
return $this->rewrite_content_to_base($base, $this->doc->saveHTML($child));
}
}
}
}
// NOTE(review): same duplicated-return pattern as above - confirm which one should remain
return $this->subtree_or_text($content);
return $this->rewrite_content_to_base($base, $this->subtree_or_text($content));
}
}
@ -122,16 +155,22 @@ class FeedItem_Atom extends FeedItem_Common {
function get_enclosures() {
$links = $this->elem->getElementsByTagName("link");
$encs = array();
$encs = [];
foreach ($links as $link) {
if ($link && $link->hasAttribute("href") && $link->hasAttribute("rel")) {
$base = $this->xpath->evaluate("string(ancestor-or-self::*[@xml:base][1]/@xml:base)", $link);
if ($link->getAttribute("rel") == "enclosure") {
$enc = new FeedEnclosure();
$enc->type = clean($link->getAttribute("type"));
$enc->link = clean($link->getAttribute("href"));
$enc->length = clean($link->getAttribute("length"));
$enc->link = clean($link->getAttribute("href"));
if (!empty($base)) {
$enc->link = UrlHelper::rewrite_relative($base, $enc->link);
}
array_push($encs, $enc);
}

View File

@ -20,14 +20,14 @@ class UrlHelper {
}
/**
* Converts a (possibly) relative URL to a absolute one.
* Converts a (possibly) relative URL to an absolute one, using the provided base URL.
*
* @param string $url Base URL (i.e. from where the document is)
* @param string $base_url Base URL (i.e. from where the document is)
* @param string $rel_url Possibly relative URL in the document
*
* @return string Absolute URL
*/
public static function rewrite_relative($url, $rel_url) {
public static function rewrite_relative($base_url, $rel_url) {
$rel_parts = parse_url($rel_url);
@ -40,14 +40,19 @@ class UrlHelper {
# allow magnet links
return $rel_url;
} else {
$parts = parse_url($url);
$base_parts = parse_url($base_url);
$rel_parts['host'] = $parts['host'];
$rel_parts['scheme'] = $parts['scheme'];
$rel_parts['host'] = $base_parts['host'];
$rel_parts['scheme'] = $base_parts['scheme'];
if (isset($rel_parts['path'])) {
if (strpos($rel_parts['path'], '/') !== 0)
$rel_parts['path'] = '/' . $rel_parts['path'];
// experimental: if relative url path is not absolute (i.e. starting with /) concatenate it using base url path
// (i'm not sure if it's a good idea)
if (strpos($rel_parts['path'], '/') !== 0) {
$rel_parts['path'] = with_trailing_slash($base_parts['path']) . $rel_parts['path'];
}
$rel_parts['path'] = str_replace("/./", "/", $rel_parts['path']);
$rel_parts['path'] = str_replace("//", "/", $rel_parts['path']);

View File

@ -193,8 +193,8 @@
}
/** function is @deprecated */
function rewrite_relative_url($url, $rel_url) {
return UrlHelper::rewrite_relative($url, $rel_url);
function rewrite_relative_url($base_url, $rel_url) {
return UrlHelper::rewrite_relative($base_url, $rel_url);
}
/** function is @deprecated */