From 8b6ef90385874cefcb904e59801b3c0482805849 Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Wed, 13 Feb 2019 14:49:27 +0300 Subject: [PATCH] update php-readability --- .../andreskrey/Readability/Configuration.php | 42 ++- .../Readability/Nodes/DOM/DOMNode.php | 1 + .../Readability/Nodes/NodeTrait.php | 162 ++++++++-- .../Readability/Nodes/NodeUtility.php | 11 +- vendor/andreskrey/Readability/Readability.php | 293 +++++++++++------- 5 files changed, 359 insertions(+), 150 deletions(-) diff --git a/vendor/andreskrey/Readability/Configuration.php b/vendor/andreskrey/Readability/Configuration.php index 951740ed0..6c17bc757 100644 --- a/vendor/andreskrey/Readability/Configuration.php +++ b/vendor/andreskrey/Readability/Configuration.php @@ -21,7 +21,7 @@ class Configuration /** * @var int */ - protected $wordThreshold = 500; + protected $charThreshold = 500; /** * @var bool @@ -109,9 +109,9 @@ class Configuration // If no logger has been set, just return a null logger if ($this->logger === null) { return new NullLogger(); - } else { - return $this->logger; } + + return $this->logger; } /** @@ -149,19 +149,45 @@ class Configuration /** * @return int */ - public function getWordThreshold() + public function getCharThreshold() { - return $this->wordThreshold; + return $this->charThreshold; } /** - * @param int $wordThreshold + * @param int $charThreshold * * @return $this */ - public function setWordThreshold($wordThreshold) + public function setCharThreshold($charThreshold) { - $this->wordThreshold = $wordThreshold; + $this->charThreshold = $charThreshold; + + return $this; + } + + /** + * @deprecated Use getCharThreshold. 
Will be removed in version 2.0 + * + * @return int + */ + public function getWordThreshold() + { + @trigger_error('getWordThreshold was replaced with getCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED); + + return $this->charThreshold; + } + + /** + * @param int $charThreshold + * + * @return $this + */ + public function setWordThreshold($charThreshold) + { + @trigger_error('setWordThreshold was replaced with setCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED); + + $this->charThreshold = $charThreshold; return $this; } diff --git a/vendor/andreskrey/Readability/Nodes/DOM/DOMNode.php b/vendor/andreskrey/Readability/Nodes/DOM/DOMNode.php index f1944c44b..7c3c4f3a2 100644 --- a/vendor/andreskrey/Readability/Nodes/DOM/DOMNode.php +++ b/vendor/andreskrey/Readability/Nodes/DOM/DOMNode.php @@ -6,6 +6,7 @@ use andreskrey\Readability\Nodes\NodeTrait; /** * @method getAttribute($attribute) + * @method hasAttribute($attribute) */ class DOMNode extends \DOMNode { diff --git a/vendor/andreskrey/Readability/Nodes/NodeTrait.php b/vendor/andreskrey/Readability/Nodes/NodeTrait.php index 13611c9e7..d7060ccbb 100644 --- a/vendor/andreskrey/Readability/Nodes/NodeTrait.php +++ b/vendor/andreskrey/Readability/Nodes/NodeTrait.php @@ -6,6 +6,7 @@ use andreskrey\Readability\Nodes\DOM\DOMDocument; use andreskrey\Readability\Nodes\DOM\DOMElement; use andreskrey\Readability\Nodes\DOM\DOMNode; use andreskrey\Readability\Nodes\DOM\DOMText; +use DOMNodeList; /** * @method \DOMNode removeAttribute($name) @@ -50,6 +51,21 @@ trait NodeTrait 'select', ]; + /** + * The commented out elements qualify as phrasing content but tend to be + * removed by readability when put into paragraphs, so we ignore them here. 
+ * + * @var array + */ + private $phrasing_elems = [ + // 'CANVAS', 'IFRAME', 'SVG', 'VIDEO', + 'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data', + 'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label', + 'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q', + 'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', + 'sup', 'textarea', 'time', 'var', 'wbr' + ]; + /** * initialized getter. * @@ -65,7 +81,19 @@ trait NodeTrait */ public function isReadabilityDataTable() { - return $this->readabilityDataTable; + /* + * This is a workaround that I'd like to remove in the future. + * Seems that although we are extending the base DOMElement and adding custom properties (like this one, + * 'readabilityDataTable'), these properties get lost when you search for elements with getElementsByTagName. + * This means that even if we mark the tables in a previous step, when we want to retrieve that information, + * all the custom properties are in their default values. Somehow we need to find a way to make these properties + * permanent across the whole DOM. + * + * @see https://stackoverflow.com/questions/35654709/php-registernodeclass-and-reusing-variable-names + */ + return $this->hasAttribute('readabilityDataTable') + && $this->getAttribute('readabilityDataTable') === '1'; +// return $this->readabilityDataTable; } /** @@ -73,7 +101,9 @@ trait NodeTrait */ public function setReadabilityDataTable($param) { - $this->readabilityDataTable = $param; + // Can't be "true" because DOMDocument casts it to "1" + $this->setAttribute('readabilityDataTable', $param ? '1' : '0'); +// $this->readabilityDataTable = $param; } /** @@ -148,6 +178,24 @@ trait NodeTrait return ''; } + /** + * Override for native hasAttribute. 
+ * + * @see getAttribute + * + * @param $attributeName + * + * @return bool + */ + public function hasAttribute($attributeName) + { + if (!is_null($this->attributes)) { + return parent::hasAttribute($attributeName); + } + + return false; + } + /** * Get the ancestors of the current node. * @@ -332,22 +380,26 @@ trait NodeTrait * Check if a given node has one of its ancestor tag name matching the * provided one. * - * @param DOMElement $node * @param string $tagName * @param int $maxDepth + * @param callable $filterFn * * @return bool */ - public function hasAncestorTag($node, $tagName, $maxDepth = 3) + public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null) { $depth = 0; + $node = $this; + while ($node->parentNode) { if ($maxDepth > 0 && $depth > $maxDepth) { return false; } - if ($node->parentNode->nodeName === $tagName) { + + if ($node->parentNode->nodeName === $tagName && (!$filterFn || $filterFn($node->parentNode))) { return true; } + $node = $node->parentNode; $depth++; } @@ -356,30 +408,29 @@ trait NodeTrait } /** - * Checks if the current node has a single child and if that child is a P node. - * Useful to convert

<div><p> nodes to a single <p>

node and avoid confusing the scoring system since div with p - * tags are, in practice, paragraphs. + * Check if this node has only whitespace and a single element with given tag + * or if it contains no element with given tag or more than 1 element. * - * @param DOMNode $node + * @param $tag string Name of tag * * @return bool */ - public function hasSinglePNode() + public function hasSingleTagInsideElement($tag) { - // There should be exactly 1 element child which is a P: - if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== 'p') { + // There should be exactly 1 element child with given tag + if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== $tag) { return false; } - // And there should be no text nodes with real content (param true on ->getChildren) - foreach ($children as $child) { - /** @var $child DOMNode */ - if ($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())) { + // And there should be no text nodes with real content + return array_reduce($children, function ($carry, $child) { + if (!$carry === false) { return false; } - } - return true; + /* @var DOMNode $child */ + return !($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())); + }); } /** @@ -431,4 +482,79 @@ trait NodeTrait ); } + + /** + * Determine if a node qualifies as phrasing content. + * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content. 
+ * + * @return bool + */ + public function isPhrasingContent() + { + return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems) !== false || + (!is_null($this->childNodes) && + ($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins') && + array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) { + return $node->isPhrasingContent() && $carry; + }, true) + ); + } + + public function isProbablyVisible() + { + /* + * In the original JS project they check if the node has the style display=none, which unfortunately + * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none". + * + * Might be a good idea to check for classes or other attributes like 'aria-hidden' + */ + + return !preg_match('/display:( )?none/', $this->getAttribute('style')) && !$this->hasAttribute('hidden'); + } + + public function isWhitespace() + { + return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) || + ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br'); + } + + /** + * This is a hack that overcomes the issue of node shifting when scanning and removing nodes. + * + * In the JS version of getElementsByTagName, if you remove a node it will not appear during the + * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an + * orphan node and will give an exception if you try to do anything with it. + * + * Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are + * removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that + * never looped over. 
(index is at position 5, 2 nodes are removed, next one should be node 3, but the foreach tries + * to access node 6) + * + * This function solves this by searching for the nodes on every loop and keeping track of the count differences. + * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be + * used only when the results of the search are going to be used to remove the nodes. + * + * @param string $tag + * + * @return \Generator + */ + public function shiftingAwareGetElementsByTagName($tag) + { + /** @var $nodes DOMNodeList */ + $nodes = $this->getElementsByTagName($tag); + $count = $nodes->length; + + for ($i = 0; $i < $count; $i = max(++$i, 0)) { + yield $nodes->item($i); + + // Search for all the nodes again + $nodes = $this->getElementsByTagName($tag); + + // Subtract the amount of nodes removed from the current index + $i -= $count - $nodes->length; + + // Subtract the amount of nodes removed from the current count + $count -= ($count - $nodes->length); + } + } } diff --git a/vendor/andreskrey/Readability/Nodes/NodeUtility.php b/vendor/andreskrey/Readability/Nodes/NodeUtility.php index 752e9f410..7a1f18ee4 100644 --- a/vendor/andreskrey/Readability/Nodes/NodeUtility.php +++ b/vendor/andreskrey/Readability/Nodes/NodeUtility.php @@ -17,13 +17,13 @@ class NodeUtility * @var array */ public static $regexps = [ - 'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', + 'unlikelyCandidates' => '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', 'extraneous' => 
'/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', 'byline' => '/byline|author|dateline|writtenby|p-author/i', 'replaceFonts' => '/<(\/?)font[^>]*>/gi', 'normalize' => '/\s{2,}/', - 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i', + 'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i', 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', 'prevLink' => '/(prev|earl|old|new|<|«)/i', 'whitespace' => '/^\s*$/', @@ -45,8 +45,8 @@ class NodeUtility { $next = $node; while ($next - && $next->nodeName !== '#text' - && trim($next->textContent)) { + && $next->nodeType !== XML_ELEMENT_NODE + && $next->isWhitespace()) { $next = $next->nextSibling; } @@ -57,12 +57,13 @@ class NodeUtility * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new * element with the new tag name and importing it to the main DOMDocument. * + * @param DOMNode $node * @param string $value * @param bool $importAttributes * * @return DOMNode */ - public static function setNodeTag($node, $value, $importAttributes = false) + public static function setNodeTag($node, $value, $importAttributes = true) { $new = new DOMDocument('1.0', 'utf-8'); $new->appendChild($new->createElement($value)); diff --git a/vendor/andreskrey/Readability/Readability.php b/vendor/andreskrey/Readability/Readability.php index 93fc81070..7b7eed6bf 100644 --- a/vendor/andreskrey/Readability/Readability.php +++ b/vendor/andreskrey/Readability/Readability.php @@ -127,7 +127,7 @@ class Readability * * @throws ParseException * - * @return array|bool + * @return bool */ public function parse($html) { @@ -164,14 +164,11 @@ class Readability $length = mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent)); - $this->logger->info(sprintf('[Parsing] Article parsed. 
Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold())); + $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold())); - $parseSuccessful = true; - - if ($result && $length < $this->configuration->getWordThreshold()) { + if ($result && $length < $this->configuration->getCharThreshold()) { $this->dom = $this->loadHTML($html); $root = $this->dom->getElementsByTagName('body')->item(0); - $parseSuccessful = false; if ($this->configuration->getStripUnlikelyCandidates()) { $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false'); @@ -204,7 +201,6 @@ class Readability $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.'); $result = $this->attempts[0]['articleContent']; - $parseSuccessful = true; break; } } else { @@ -212,26 +208,24 @@ class Readability } } - if ($parseSuccessful) { - $result = $this->postProcessContent($result); + $result = $this->postProcessContent($result); - // If we haven't found an excerpt in the article's metadata, use the article's - // first paragraph as the excerpt. This can be used for displaying a preview of - // the article's content. - if (!$this->getExcerpt()) { - $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.'); - $paragraphs = $result->getElementsByTagName('p'); - if ($paragraphs->length > 0) { - $this->setExcerpt(trim($paragraphs->item(0)->textContent)); - } + // If we haven't found an excerpt in the article's metadata, use the article's + // first paragraph as the excerpt. This can be used for displaying a preview of + // the article's content. 
+ if (!$this->getExcerpt()) { + $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.'); + $paragraphs = $result->getElementsByTagName('p'); + if ($paragraphs->length > 0) { + $this->setExcerpt(trim($paragraphs->item(0)->textContent)); } - - $this->setContent($result); - - $this->logger->info('*** Parse successful :)'); - - return true; } + + $this->setContent($result); + + $this->logger->info('*** Parse successful :)'); + + return true; } /** @@ -292,77 +286,98 @@ class Readability $this->logger->debug('[Metadata] Retrieving metadata...'); $values = []; - // Match "description", or Twitter's "twitter:description" (Cards) - // in name attribute. - $namePattern = '/^\s*((twitter)\s*:\s*)?(description|title|image)\s*$/i'; + // property is a space-separated list of values + $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image)\s*/i'; - // Match Facebook's Open Graph title & description properties. - $propertyPattern = '/^\s*og\s*:\s*(description|title|image)\s*$/i'; + // name is a single value + $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image)\s*$/i'; + // Find description tags. 
foreach ($this->dom->getElementsByTagName('meta') as $meta) { /* @var DOMNode $meta */ $elementName = $meta->getAttribute('name'); $elementProperty = $meta->getAttribute('property'); - - if (in_array('author', [$elementName, $elementProperty])) { - $this->logger->info(sprintf('[Metadata] Found author: \'%s\'', $meta->getAttribute('content'))); - $this->setAuthor($meta->getAttribute('content')); - continue; - } - + $content = $meta->getAttribute('content'); + $matches = null; $name = null; - if (preg_match($namePattern, $elementName)) { - $name = $elementName; - } elseif (preg_match($propertyPattern, $elementProperty)) { - $name = $elementProperty; + + if ($elementProperty) { + if (preg_match($propertyPattern, $elementProperty, $matches)) { + for ($i = count($matches) - 1; $i >= 0; $i--) { + // Convert to lowercase, and remove any whitespace + // so we can match below. + $name = preg_replace('/\s/', '', mb_strtolower($matches[$i])); + // multiple authors + $values[$name] = trim($content); + } + } } - if ($name) { - $content = $meta->getAttribute('content'); + if (!$matches && $elementName && preg_match($namePattern, $elementName)) { + $name = $elementName; if ($content) { - // Convert to lowercase and remove any whitespace - // so we can match below. - $name = preg_replace('/\s/', '', strtolower($name)); + // Convert to lowercase, remove any whitespace, and convert dots + // to colons so we can match below. + $name = preg_replace(['/\s/', '/\./'], ['', ':'], mb_strtolower($name)); $values[$name] = trim($content); } } } - if (array_key_exists('description', $values)) { - $this->logger->info(sprintf('[Metadata] Found excerpt in \'description\' tag: \'%s\'', $values['description'])); - $this->setExcerpt($values['description']); - } elseif (array_key_exists('og:description', $values)) { - // Use facebook open graph description. 
- $this->logger->info(sprintf('[Metadata] Found excerpt in \'og:description\' tag: \'%s\'', $values['og:description'])); - $this->setExcerpt($values['og:description']); - } elseif (array_key_exists('twitter:description', $values)) { - // Use twitter cards description. - $this->logger->info(sprintf('[Metadata] Found excerpt in \'twitter:description\' tag: \'%s\'', $values['twitter:description'])); - $this->setExcerpt($values['twitter:description']); - } - $this->setTitle($this->getArticleTitle()); + // get title + /* + * This is a very convoluted way of extracting the first matching key of the $values array + * against a set of options. + * + * This could be easily replaced with an ugly set of isset($values['key']) or a bunch of ??s. + * Will probably replace it with ??s after dropping support of PHP5.6 + */ + + $key = current(array_intersect([ + 'dc:title', + 'dcterm:title', + 'og:title', + 'weibo:article:title', + 'weibo:webpage:title', + 'title', + 'twitter:title' + ], array_keys($values))); + + $this->setTitle(isset($values[$key]) ? trim($values[$key]) : null); if (!$this->getTitle()) { - if (array_key_exists('og:title', $values)) { - // Use facebook open graph title. - $this->logger->info(sprintf('[Metadata] Found title in \'og:title\' tag: \'%s\'', $values['og:title'])); - $this->setTitle($values['og:title']); - } elseif (array_key_exists('twitter:title', $values)) { - // Use twitter cards title. 
- $this->logger->info(sprintf('[Metadata] Found title in \'twitter:title\' tag: \'%s\'', $values['twitter:title'])); - $this->setTitle($values['twitter:title']); - } + $this->setTitle($this->getArticleTitle()); } - if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) { - if (array_key_exists('og:image', $values)) { - $this->logger->info(sprintf('[Metadata] Found main image in \'og:image\' tag: \'%s\'', $values['og:image'])); - $this->setImage($values['og:image']); - } else { - $this->logger->info(sprintf('[Metadata] Found main image in \'twitter:image\' tag: \'%s\'', $values['twitter:image'])); - $this->setImage($values['twitter:image']); - } - } + // get author + $key = current(array_intersect([ + 'dc:creator', + 'dcterm:creator', + 'author' + ], array_keys($values))); + + $this->setAuthor(isset($values[$key]) ? $values[$key] : null); + + // get description + $key = current(array_intersect([ + 'dc:description', + 'dcterm:description', + 'og:description', + 'weibo:article:description', + 'weibo:webpage:description', + 'description', + 'twitter:description' + ], array_keys($values))); + + $this->setExcerpt(isset($values[$key]) ? $values[$key] : null); + + // get main image + $key = current(array_intersect([ + 'og:image', + 'twitter:image' + ], array_keys($values))); + + $this->setImage(isset($values[$key]) ? $values[$key] : null); } /** @@ -453,7 +468,7 @@ class Readability return null; } - $curTitle = $originalTitle; + $curTitle = $originalTitle = trim($originalTitle); $titleHadHierarchicalSeparators = false; /* @@ -623,8 +638,6 @@ class Readability */ while ($node) { - $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id'); - // Remove DOMComments nodes as we don't need them and mess up children counting if ($node->nodeType === XML_COMMENT_NODE) { $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... 
Node content was: \'%s\'', substr($node->nodeValue, 0, 128))); @@ -632,6 +645,14 @@ class Readability continue; } + $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id'); + + if (!$node->isProbablyVisible()) { + $this->logger->debug(sprintf('[Get Nodes] Removing hidden node... Match string was: \'%s\'', $matchString)); + $node = NodeUtility::removeAndGetNext($node); + continue; + } + // Check to see if this node is a byline, and remove it if it is. if ($this->checkByline($node, $matchString)) { $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128))); @@ -671,13 +692,35 @@ class Readability // Turn all divs that don't have children block level elements into p's if ($node->nodeName === 'div') { + // Put phrasing content into paragraphs. + $p = null; + $childNode = $node->firstChild; + while ($childNode) { + $nextSibling = $childNode->nextSibling; + if ($childNode->isPhrasingContent()) { + if ($p !== null) { + $p->appendChild($childNode); + } elseif (!$childNode->isWhitespace()) { + $p = $this->dom->createElement('p'); + $node->replaceChild($p, $childNode); + $p->appendChild($childNode); + } + } elseif ($p !== null) { + while ($p->lastChild && $p->lastChild->isWhitespace()) { + $p->removeChild($p->lastChild); + } + $p = null; + } + $childNode = $nextSibling; + } + /* * Sites like http://mobile.slate.com encloses each paragraph with a DIV * element. DIVs with only a P element inside and no text content can be * safely converted into plain P elements to avoid confusing the scoring * algorithm with DIVs with are, in practice, paragraphs. */ - if ($node->hasSinglePNode()) { + if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) { $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. 
Node content is: \'%s\'', substr($node->nodeValue, 0, 128))); $pNode = $node->getChildren(true)[0]; $node->parentNode->replaceChild($pNode, $node); @@ -687,16 +730,6 @@ class Readability $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128))); $node = NodeUtility::setNodeTag($node, 'p'); $elementsToScore[] = $node; - } else { - // EXPERIMENTAL - foreach ($node->getChildren() as $child) { - /** @var $child DOMNode */ - if ($child->nodeType === XML_TEXT_NODE && mb_strlen(trim($child->getTextContent())) > 0) { - $this->logger->debug(sprintf('[Get Nodes] Found DIV a text node inside, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128))); - $newNode = $node->createNode($child, 'p'); - $child->parentNode->replaceChild($newNode, $child); - } - } } } @@ -751,7 +784,7 @@ class Readability if (gettype($text) == 'string') { $byline = trim($text); - return (mb_strlen($byline) > 0) && (mb_strlen($text) < 100); + return (mb_strlen($byline) > 0) && (mb_strlen($byline) < 100); } return false; @@ -764,15 +797,10 @@ class Readability */ private function removeScripts(DOMDocument $dom) { - $toRemove = ['script', 'noscript']; - - foreach ($toRemove as $tag) { - while ($script = $dom->getElementsByTagName($tag)) { - if ($script->item(0)) { - $script->item(0)->parentNode->removeChild($script->item(0)); - } else { - break; - } + foreach (['script', 'noscript'] as $tag) { + $nodes = $dom->getElementsByTagName($tag); + foreach (iterator_to_array($nodes) as $node) { + NodeUtility::removeNode($node); } } } @@ -786,15 +814,7 @@ class Readability { $this->logger->info('[PrepDocument] Preparing document for parsing...'); - /* - * DOMNodeList must be converted to an array before looping over it. - * This is done to avoid node shifting when removing nodes. 
- * - * Reverse traversing cannot be done here because we need to find brs that are right next to other brs. - * (If we go the other way around we need to search for previous nodes forcing the creation of new functions - * that will be used only here) - */ - foreach (iterator_to_array($dom->getElementsByTagName('br')) as $br) { + foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) { $next = $br->nextSibling; /* @@ -831,12 +851,16 @@ class Readability while ($next) { // If we've hit another

<br><br>, we're done adding children to this <p>

. if ($next->nodeName === 'br') { - $nextElem = NodeUtility::nextElement($next); + $nextElem = NodeUtility::nextElement($next->nextSibling); if ($nextElem && $nextElem->nodeName === 'br') { break; } } + if (!$next->isPhrasingContent()) { + break; + } + $this->logger->debug('[PrepDocument] Replacing BR with a P node...'); // Otherwise, make this node a child of the new

. @@ -844,6 +868,14 @@ class Readability $p->appendChild($next); $next = $sibling; } + + while ($p->lastChild && $p->lastChild->isWhitespace()) { + $p->removeChild($p->lastChild); + } + + if ($p->parentNode->tagName === 'p') { + NodeUtility::setNodeTag($p->parentNode, 'div'); + } } } @@ -853,7 +885,7 @@ class Readability for ($i = 0; $i < $length; $i++) { $this->logger->debug('[PrepDocument] Converting font tag into a span tag.'); $font = $fonts->item($length - 1 - $i); - NodeUtility::setNodeTag($font, 'span', true); + NodeUtility::setNodeTag($font, 'span'); } } @@ -989,7 +1021,9 @@ class Readability // and whose scores are quite closed with current `topCandidate` node. $alternativeCandidateAncestors = []; for ($i = 1; $i < count($topCandidates); $i++) { - if ($topCandidates[$i]->contentScore / $topCandidate->contentScore >= 0.75) { + // In some cases we may end up with a top candidate with zero content score. To avoid dividing by zero + // we have to use max() and replace zero with a low value like 0.1 + if ($topCandidates[$i]->contentScore / max($topCandidate->contentScore, 0.1) >= 0.75) { array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false)); } } @@ -997,7 +1031,9 @@ class Readability $MINIMUM_TOPCANDIDATES = 3; if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) { $parentOfTopCandidate = $topCandidate->parentNode; - while ($parentOfTopCandidate->nodeName !== 'body') { + + // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher + while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) { $listsContainingThisAncestor = 0; for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) { $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]); @@ -1027,8 +1063,7 @@ class Readability 
$scoreThreshold = $lastScore / 3; /* @var DOMElement $parentOfTopCandidate */ - // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher - while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) { + while ($parentOfTopCandidate->nodeName !== 'body') { $parentScore = $parentOfTopCandidate->contentScore; if ($parentScore < $scoreThreshold) { break; @@ -1175,6 +1210,7 @@ class Readability $this->_clean($article, 'h1'); $this->_clean($article, 'footer'); $this->_clean($article, 'link'); + $this->_clean($article, 'aside'); // Clean out elements have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". @@ -1227,6 +1263,22 @@ class Readability } } + // Remove single-cell tables + foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) { + /** @var DOMNode $table */ + $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->childNodes[0] : $table; + if ($tbody->hasSingleTagInsideElement('tr')) { + $row = $tbody->firstChild; + if ($row->hasSingleTagInsideElement('td')) { + $cell = $row->firstChild; + $cell = NodeUtility::setNodeTag($cell, (array_reduce(iterator_to_array($cell->childNodes), function ($carry, $node) { + return $node->isPhrasingContent() && $carry; + }, true)) ? 'p' : 'div'); + $table->parentNode->replaceChild($cell, $table); + } + } + } + return $article; } @@ -1374,6 +1426,7 @@ class Readability /** * @param DOMDocument $article + * @param string $tag Tag to clean conditionally * * @return void */ @@ -1398,7 +1451,9 @@ class Readability $node = $DOMNodeList->item($length - 1 - $i); // First check if we're in a data table, in which case don't remove us. 
- if ($node->hasAncestorTag($node, 'table', -1) && $node->isReadabilityDataTable()) { + if ($node->hasAncestorTag('table', -1, function ($node) { + return $node->isReadabilityDataTable(); + })) { continue; } @@ -1439,10 +1494,10 @@ class Readability $contentLength = mb_strlen($node->getTextContent(true)); $haveToRemove = - ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag($node, 'figure')) || + ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag('figure')) || (!$isList && $li > $p) || ($input > floor($p / 3)) || - (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag($node, 'figure')) || + (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag('figure')) || (!$isList && $weight < 25 && $linkDensity > 0.2) || ($weight >= 25 && $linkDensity > 0.5) || (($embedCount === 1 && $contentLength < 75) || $embedCount > 1); @@ -1477,7 +1532,7 @@ class Readability // Allow youtube and vimeo videos through as people usually want to see those. if ($isEmbed) { $attributeValues = []; - foreach ($item->attributes as $name => $value) { + foreach ($item->attributes as $value) { $attributeValues[] = $value->nodeValue; } $attributeValues = implode('|', $attributeValues);