update php-readability
This commit is contained in:
parent
874a2d2170
commit
8b6ef90385
|
@ -21,7 +21,7 @@ class Configuration
|
|||
/**
|
||||
* @var int
|
||||
*/
|
||||
protected $wordThreshold = 500;
|
||||
protected $charThreshold = 500;
|
||||
|
||||
/**
|
||||
* @var bool
|
||||
|
@ -109,9 +109,9 @@ class Configuration
|
|||
// If no logger has been set, just return a null logger
|
||||
if ($this->logger === null) {
|
||||
return new NullLogger();
|
||||
} else {
|
||||
return $this->logger;
|
||||
}
|
||||
|
||||
return $this->logger;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -149,19 +149,45 @@ class Configuration
|
|||
/**
|
||||
* @return int
|
||||
*/
|
||||
public function getWordThreshold()
|
||||
public function getCharThreshold()
|
||||
{
|
||||
return $this->wordThreshold;
|
||||
return $this->charThreshold;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param int $wordThreshold
|
||||
* @param int $charThreshold
|
||||
*
|
||||
* @return $this
|
||||
*/
|
||||
public function setWordThreshold($wordThreshold)
|
||||
public function setCharThreshold($charThreshold)
|
||||
{
|
||||
$this->wordThreshold = $wordThreshold;
|
||||
$this->charThreshold = $charThreshold;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use getCharThreshold. Will be removed in version 2.0
|
||||
*
|
||||
* @return int
|
||||
*/
|
||||
public function getWordThreshold()
|
||||
{
|
||||
@trigger_error('getWordThreshold was replaced with getCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);
|
||||
|
||||
return $this->charThreshold;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param int $charThreshold
|
||||
*
|
||||
* @return $this
|
||||
*/
|
||||
public function setWordThreshold($charThreshold)
|
||||
{
|
||||
@trigger_error('setWordThreshold was replaced with setCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);
|
||||
|
||||
$this->charThreshold = $charThreshold;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
|
|
@ -6,6 +6,7 @@ use andreskrey\Readability\Nodes\NodeTrait;
|
|||
|
||||
/**
|
||||
* @method getAttribute($attribute)
|
||||
* @method hasAttribute($attribute)
|
||||
*/
|
||||
class DOMNode extends \DOMNode
|
||||
{
|
||||
|
|
|
@ -6,6 +6,7 @@ use andreskrey\Readability\Nodes\DOM\DOMDocument;
|
|||
use andreskrey\Readability\Nodes\DOM\DOMElement;
|
||||
use andreskrey\Readability\Nodes\DOM\DOMNode;
|
||||
use andreskrey\Readability\Nodes\DOM\DOMText;
|
||||
use DOMNodeList;
|
||||
|
||||
/**
|
||||
* @method \DOMNode removeAttribute($name)
|
||||
|
@ -50,6 +51,21 @@ trait NodeTrait
|
|||
'select',
|
||||
];
|
||||
|
||||
/**
|
||||
* The commented out elements qualify as phrasing content but tend to be
|
||||
* removed by readability when put into paragraphs, so we ignore them here.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
private $phrasing_elems = [
|
||||
// 'CANVAS', 'IFRAME', 'SVG', 'VIDEO',
|
||||
'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data',
|
||||
'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label',
|
||||
'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q',
|
||||
'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub',
|
||||
'sup', 'textarea', 'time', 'var', 'wbr'
|
||||
];
|
||||
|
||||
/**
|
||||
* initialized getter.
|
||||
*
|
||||
|
@ -65,7 +81,19 @@ trait NodeTrait
|
|||
*/
|
||||
public function isReadabilityDataTable()
|
||||
{
|
||||
return $this->readabilityDataTable;
|
||||
/*
|
||||
* This is a workaround that I'd like to remove in the future.
|
||||
* Seems that although we are extending the base DOMElement and adding custom properties (like this one,
|
||||
* 'readabilityDataTable'), these properties get lost when you search for elements with getElementsByTagName.
|
||||
* This means that even if we mark the tables in a previous step, when we want to retrieve that information,
|
||||
* all the custom properties are in their default values. Somehow we need to find a way to make these properties
|
||||
* permanent across the whole DOM.
|
||||
*
|
||||
* @see https://stackoverflow.com/questions/35654709/php-registernodeclass-and-reusing-variable-names
|
||||
*/
|
||||
return $this->hasAttribute('readabilityDataTable')
|
||||
&& $this->getAttribute('readabilityDataTable') === '1';
|
||||
// return $this->readabilityDataTable;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -73,7 +101,9 @@ trait NodeTrait
|
|||
*/
|
||||
public function setReadabilityDataTable($param)
|
||||
{
|
||||
$this->readabilityDataTable = $param;
|
||||
// Can't be "true" because DOMDocument casts it to "1"
|
||||
$this->setAttribute('readabilityDataTable', $param ? '1' : '0');
|
||||
// $this->readabilityDataTable = $param;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -148,6 +178,24 @@ trait NodeTrait
|
|||
return '';
|
||||
}
|
||||
|
||||
/**
|
||||
* Override for native hasAttribute.
|
||||
*
|
||||
* @see getAttribute
|
||||
*
|
||||
* @param $attributeName
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function hasAttribute($attributeName)
{
    // A node whose attribute map is null cannot carry attributes, so the
    // native lookup is only attempted when the map exists.
    return is_null($this->attributes)
        ? false
        : parent::hasAttribute($attributeName);
}
|
||||
|
||||
/**
|
||||
* Get the ancestors of the current node.
|
||||
*
|
||||
|
@ -332,22 +380,26 @@ trait NodeTrait
|
|||
* Check if a given node has one of its ancestor tag name matching the
|
||||
* provided one.
|
||||
*
|
||||
* @param DOMElement $node
|
||||
* @param string $tagName
|
||||
* @param int $maxDepth
|
||||
* @param callable $filterFn
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function hasAncestorTag($node, $tagName, $maxDepth = 3)
|
||||
public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null)
|
||||
{
|
||||
$depth = 0;
|
||||
$node = $this;
|
||||
|
||||
while ($node->parentNode) {
|
||||
if ($maxDepth > 0 && $depth > $maxDepth) {
|
||||
return false;
|
||||
}
|
||||
if ($node->parentNode->nodeName === $tagName) {
|
||||
|
||||
if ($node->parentNode->nodeName === $tagName && (!$filterFn || $filterFn($node->parentNode))) {
|
||||
return true;
|
||||
}
|
||||
|
||||
$node = $node->parentNode;
|
||||
$depth++;
|
||||
}
|
||||
|
@ -356,30 +408,29 @@ trait NodeTrait
|
|||
}
|
||||
|
||||
/**
|
||||
* Checks if the current node has a single child and if that child is a P node.
|
||||
* Useful to convert <div><p> nodes to a single <p> node and avoid confusing the scoring system since div with p
|
||||
* tags are, in practice, paragraphs.
|
||||
* Check if this node has only whitespace and a single element with given tag.
|
||||
* Returns false if it contains no element with given tag or more than 1 element.
|
||||
*
|
||||
* @param DOMNode $node
|
||||
* @param $tag string Name of tag
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function hasSinglePNode()
|
||||
public function hasSingleTagInsideElement($tag)
|
||||
{
|
||||
// There should be exactly 1 element child which is a P:
|
||||
if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== 'p') {
|
||||
// There should be exactly 1 element child with given tag
|
||||
if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== $tag) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// And there should be no text nodes with real content (param true on ->getChildren)
|
||||
foreach ($children as $child) {
|
||||
/** @var $child DOMNode */
|
||||
if ($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())) {
|
||||
// And there should be no text nodes with real content
|
||||
return array_reduce($children, function ($carry, $child) {
|
||||
if (!$carry === false) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
/* @var DOMNode $child */
|
||||
return !($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent()));
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -431,4 +482,79 @@ trait NodeTrait
|
|||
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if a node qualifies as phrasing content.
|
||||
* https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content.
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function isPhrasingContent()
{
    // Text nodes and the listed inline tags always count as phrasing content.
    if ($this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems, true)) {
        return true;
    }

    // <a>, <del> and <ins> only qualify when every one of their children does.
    if (is_null($this->childNodes) || !in_array($this->nodeName, ['a', 'del', 'ins'], true)) {
        return false;
    }

    // Stop at the first non-phrasing child instead of folding over the whole list.
    foreach ($this->childNodes as $child) {
        if (!$child->isPhrasingContent()) {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/**
 * Heuristically decide whether this node is visible.
 *
 * The original JS project checks the style of the node for display=none, which
 * we have no way of knowing here. Only the inline `style` attribute and the
 * `hidden` attribute are examined.
 *
 * Might be a good idea to check for classes or other attributes like 'aria-hidden'.
 *
 * @return bool false when the inline style contains "display: none" or the node has a `hidden` attribute
 */
public function isProbablyVisible()
{
    // CSS property names and keywords are case-insensitive and allow arbitrary
    // whitespace around the colon, so match "display : none", "DISPLAY:NONE", etc.
    $hiddenByStyle = preg_match('/display\s*:\s*none/i', (string) $this->getAttribute('style')) === 1;

    return !$hiddenByStyle && !$this->hasAttribute('hidden');
}
|
||||
|
||||
/**
 * Tell whether this node is whitespace: either a text node whose trimmed
 * content is empty, or a <br> element.
 *
 * @return bool
 */
public function isWhitespace()
{
    if ($this->nodeType === XML_TEXT_NODE) {
        return mb_strlen(trim($this->textContent)) === 0;
    }

    return $this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br';
}
|
||||
|
||||
/**
|
||||
* This is a hack that overcomes the issue of node shifting when scanning and removing nodes.
|
||||
*
|
||||
* In the JS version of getElementsByTagName, if you remove a node it will not appear during the
|
||||
* foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an
|
||||
* orphan node and will give an exception if you try to do anything with it.
|
||||
*
|
||||
* Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are
|
||||
* removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that
|
||||
* were never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the foreach tries
|
||||
* to access node 6)
|
||||
*
|
||||
* This function solves this by searching for the nodes on every loop and keeping track of the count differences.
|
||||
* Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be
|
||||
* used only when the results of the search are going to be used to remove the nodes.
|
||||
*
|
||||
* @param string $tag
|
||||
*
|
||||
* @return \Generator
|
||||
*/
|
||||
public function shiftingAwareGetElementsByTagName($tag)
{
    /** @var $matches DOMNodeList */
    $matches = $this->getElementsByTagName($tag);
    $total = $matches->length;
    $index = 0;

    while ($index < $total) {
        yield $matches->item($index);

        // Run the search again so changes made by the consumer are visible
        $matches = $this->getElementsByTagName($tag);

        // How many nodes disappeared since the previous search
        $removed = $total - $matches->length;
        $total = $matches->length;

        // Step forward by one, pulled back by the number of removed nodes, never below zero
        $index = max($index - $removed + 1, 0);
    }
}
|
||||
}
|
||||
|
|
|
@ -17,13 +17,13 @@ class NodeUtility
|
|||
* @var array
|
||||
*/
|
||||
public static $regexps = [
|
||||
'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
|
||||
'unlikelyCandidates' => '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
|
||||
'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
|
||||
'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
|
||||
'byline' => '/byline|author|dateline|writtenby|p-author/i',
|
||||
'replaceFonts' => '/<(\/?)font[^>]*>/gi',
|
||||
'normalize' => '/\s{2,}/',
|
||||
'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i',
|
||||
'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i',
|
||||
'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
|
||||
'prevLink' => '/(prev|earl|old|new|<|«)/i',
|
||||
'whitespace' => '/^\s*$/',
|
||||
|
@ -45,8 +45,8 @@ class NodeUtility
|
|||
{
|
||||
$next = $node;
|
||||
while ($next
|
||||
&& $next->nodeName !== '#text'
|
||||
&& trim($next->textContent)) {
|
||||
&& $next->nodeType !== XML_ELEMENT_NODE
|
||||
&& $next->isWhitespace()) {
|
||||
$next = $next->nextSibling;
|
||||
}
|
||||
|
||||
|
@ -57,12 +57,13 @@ class NodeUtility
|
|||
* Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
|
||||
* element with the new tag name and importing it to the main DOMDocument.
|
||||
*
|
||||
* @param DOMNode $node
|
||||
* @param string $value
|
||||
* @param bool $importAttributes
|
||||
*
|
||||
* @return DOMNode
|
||||
*/
|
||||
public static function setNodeTag($node, $value, $importAttributes = false)
|
||||
public static function setNodeTag($node, $value, $importAttributes = true)
|
||||
{
|
||||
$new = new DOMDocument('1.0', 'utf-8');
|
||||
$new->appendChild($new->createElement($value));
|
||||
|
|
|
@ -127,7 +127,7 @@ class Readability
|
|||
*
|
||||
* @throws ParseException
|
||||
*
|
||||
* @return array|bool
|
||||
* @return bool
|
||||
*/
|
||||
public function parse($html)
|
||||
{
|
||||
|
@ -164,14 +164,11 @@ class Readability
|
|||
|
||||
$length = mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent));
|
||||
|
||||
$this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold()));
|
||||
$this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold()));
|
||||
|
||||
$parseSuccessful = true;
|
||||
|
||||
if ($result && $length < $this->configuration->getWordThreshold()) {
|
||||
if ($result && $length < $this->configuration->getCharThreshold()) {
|
||||
$this->dom = $this->loadHTML($html);
|
||||
$root = $this->dom->getElementsByTagName('body')->item(0);
|
||||
$parseSuccessful = false;
|
||||
|
||||
if ($this->configuration->getStripUnlikelyCandidates()) {
|
||||
$this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
|
||||
|
@ -204,7 +201,6 @@ class Readability
|
|||
$this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');
|
||||
|
||||
$result = $this->attempts[0]['articleContent'];
|
||||
$parseSuccessful = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
|
@ -212,7 +208,6 @@ class Readability
|
|||
}
|
||||
}
|
||||
|
||||
if ($parseSuccessful) {
|
||||
$result = $this->postProcessContent($result);
|
||||
|
||||
// If we haven't found an excerpt in the article's metadata, use the article's
|
||||
|
@ -232,7 +227,6 @@ class Readability
|
|||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a DOM Document object and loads the provided HTML on it.
|
||||
|
@ -292,77 +286,98 @@ class Readability
|
|||
$this->logger->debug('[Metadata] Retrieving metadata...');
|
||||
|
||||
$values = [];
|
||||
// Match "description", or Twitter's "twitter:description" (Cards)
|
||||
// in name attribute.
|
||||
$namePattern = '/^\s*((twitter)\s*:\s*)?(description|title|image)\s*$/i';
|
||||
// property is a space-separated list of values
|
||||
$propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image)\s*/i';
|
||||
|
||||
// Match Facebook's Open Graph title & description properties.
|
||||
$propertyPattern = '/^\s*og\s*:\s*(description|title|image)\s*$/i';
|
||||
// name is a single value
|
||||
$namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image)\s*$/i';
|
||||
|
||||
// Find description tags.
|
||||
foreach ($this->dom->getElementsByTagName('meta') as $meta) {
|
||||
/* @var DOMNode $meta */
|
||||
$elementName = $meta->getAttribute('name');
|
||||
$elementProperty = $meta->getAttribute('property');
|
||||
|
||||
if (in_array('author', [$elementName, $elementProperty])) {
|
||||
$this->logger->info(sprintf('[Metadata] Found author: \'%s\'', $meta->getAttribute('content')));
|
||||
$this->setAuthor($meta->getAttribute('content'));
|
||||
continue;
|
||||
}
|
||||
|
||||
$name = null;
|
||||
if (preg_match($namePattern, $elementName)) {
|
||||
$name = $elementName;
|
||||
} elseif (preg_match($propertyPattern, $elementProperty)) {
|
||||
$name = $elementProperty;
|
||||
}
|
||||
|
||||
if ($name) {
|
||||
$content = $meta->getAttribute('content');
|
||||
if ($content) {
|
||||
// Convert to lowercase and remove any whitespace
|
||||
$matches = null;
|
||||
$name = null;
|
||||
|
||||
if ($elementProperty) {
|
||||
if (preg_match($propertyPattern, $elementProperty, $matches)) {
|
||||
for ($i = count($matches) - 1; $i >= 0; $i--) {
|
||||
// Convert to lowercase, and remove any whitespace
|
||||
// so we can match below.
|
||||
$name = preg_replace('/\s/', '', strtolower($name));
|
||||
$name = preg_replace('/\s/', '', mb_strtolower($matches[$i]));
|
||||
// multiple authors
|
||||
$values[$name] = trim($content);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (array_key_exists('description', $values)) {
|
||||
$this->logger->info(sprintf('[Metadata] Found excerpt in \'description\' tag: \'%s\'', $values['description']));
|
||||
$this->setExcerpt($values['description']);
|
||||
} elseif (array_key_exists('og:description', $values)) {
|
||||
// Use facebook open graph description.
|
||||
$this->logger->info(sprintf('[Metadata] Found excerpt in \'og:description\' tag: \'%s\'', $values['og:description']));
|
||||
$this->setExcerpt($values['og:description']);
|
||||
} elseif (array_key_exists('twitter:description', $values)) {
|
||||
// Use twitter cards description.
|
||||
$this->logger->info(sprintf('[Metadata] Found excerpt in \'twitter:description\' tag: \'%s\'', $values['twitter:description']));
|
||||
$this->setExcerpt($values['twitter:description']);
|
||||
|
||||
if (!$matches && $elementName && preg_match($namePattern, $elementName)) {
|
||||
$name = $elementName;
|
||||
if ($content) {
|
||||
// Convert to lowercase, remove any whitespace, and convert dots
|
||||
// to colons so we can match below.
|
||||
$name = preg_replace(['/\s/', '/\./'], ['', ':'], mb_strtolower($name));
|
||||
$values[$name] = trim($content);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$this->setTitle($this->getArticleTitle());
|
||||
// get title
|
||||
/*
|
||||
* This is a very convoluted way of extracting the first matching key of the $values array
|
||||
* against a set of options.
|
||||
*
|
||||
* This could be easily replaced with an ugly set of isset($values['key']) or a bunch of ??s.
|
||||
* Will probably replace it with ??s after dropping support of PHP5.6
|
||||
*/
|
||||
|
||||
$key = current(array_intersect([
|
||||
'dc:title',
|
||||
'dcterm:title',
|
||||
'og:title',
|
||||
'weibo:article:title',
|
||||
'weibo:webpage:title',
|
||||
'title',
|
||||
'twitter:title'
|
||||
], array_keys($values)));
|
||||
|
||||
$this->setTitle(isset($values[$key]) ? trim($values[$key]) : null);
|
||||
|
||||
if (!$this->getTitle()) {
|
||||
if (array_key_exists('og:title', $values)) {
|
||||
// Use facebook open graph title.
|
||||
$this->logger->info(sprintf('[Metadata] Found title in \'og:title\' tag: \'%s\'', $values['og:title']));
|
||||
$this->setTitle($values['og:title']);
|
||||
} elseif (array_key_exists('twitter:title', $values)) {
|
||||
// Use twitter cards title.
|
||||
$this->logger->info(sprintf('[Metadata] Found title in \'twitter:title\' tag: \'%s\'', $values['twitter:title']));
|
||||
$this->setTitle($values['twitter:title']);
|
||||
}
|
||||
$this->setTitle($this->getArticleTitle());
|
||||
}
|
||||
|
||||
if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) {
|
||||
if (array_key_exists('og:image', $values)) {
|
||||
$this->logger->info(sprintf('[Metadata] Found main image in \'og:image\' tag: \'%s\'', $values['og:image']));
|
||||
$this->setImage($values['og:image']);
|
||||
} else {
|
||||
$this->logger->info(sprintf('[Metadata] Found main image in \'twitter:image\' tag: \'%s\'', $values['twitter:image']));
|
||||
$this->setImage($values['twitter:image']);
|
||||
}
|
||||
}
|
||||
// get author
|
||||
$key = current(array_intersect([
|
||||
'dc:creator',
|
||||
'dcterm:creator',
|
||||
'author'
|
||||
], array_keys($values)));
|
||||
|
||||
$this->setAuthor(isset($values[$key]) ? $values[$key] : null);
|
||||
|
||||
// get description
|
||||
$key = current(array_intersect([
|
||||
'dc:description',
|
||||
'dcterm:description',
|
||||
'og:description',
|
||||
'weibo:article:description',
|
||||
'weibo:webpage:description',
|
||||
'description',
|
||||
'twitter:description'
|
||||
], array_keys($values)));
|
||||
|
||||
$this->setExcerpt(isset($values[$key]) ? $values[$key] : null);
|
||||
|
||||
// get main image
|
||||
$key = current(array_intersect([
|
||||
'og:image',
|
||||
'twitter:image'
|
||||
], array_keys($values)));
|
||||
|
||||
$this->setImage(isset($values[$key]) ? $values[$key] : null);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -453,7 +468,7 @@ class Readability
|
|||
return null;
|
||||
}
|
||||
|
||||
$curTitle = $originalTitle;
|
||||
$curTitle = $originalTitle = trim($originalTitle);
|
||||
$titleHadHierarchicalSeparators = false;
|
||||
|
||||
/*
|
||||
|
@ -623,8 +638,6 @@ class Readability
|
|||
*/
|
||||
|
||||
while ($node) {
|
||||
$matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
|
||||
|
||||
// Remove DOMComments nodes as we don't need them and mess up children counting
|
||||
if ($node->nodeType === XML_COMMENT_NODE) {
|
||||
$this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
|
||||
|
@ -632,6 +645,14 @@ class Readability
|
|||
continue;
|
||||
}
|
||||
|
||||
$matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
|
||||
|
||||
if (!$node->isProbablyVisible()) {
|
||||
$this->logger->debug(sprintf('[Get Nodes] Removing hidden node... Match string was: \'%s\'', $matchString));
|
||||
$node = NodeUtility::removeAndGetNext($node);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check to see if this node is a byline, and remove it if it is.
|
||||
if ($this->checkByline($node, $matchString)) {
|
||||
$this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
|
||||
|
@ -671,13 +692,35 @@ class Readability
|
|||
|
||||
// Turn all divs that don't have children block level elements into p's
|
||||
if ($node->nodeName === 'div') {
|
||||
// Put phrasing content into paragraphs.
|
||||
$p = null;
|
||||
$childNode = $node->firstChild;
|
||||
while ($childNode) {
|
||||
$nextSibling = $childNode->nextSibling;
|
||||
if ($childNode->isPhrasingContent()) {
|
||||
if ($p !== null) {
|
||||
$p->appendChild($childNode);
|
||||
} elseif (!$childNode->isWhitespace()) {
|
||||
$p = $this->dom->createElement('p');
|
||||
$node->replaceChild($p, $childNode);
|
||||
$p->appendChild($childNode);
|
||||
}
|
||||
} elseif ($p !== null) {
|
||||
while ($p->lastChild && $p->lastChild->isWhitespace()) {
|
||||
$p->removeChild($p->lastChild);
|
||||
}
|
||||
$p = null;
|
||||
}
|
||||
$childNode = $nextSibling;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sites like http://mobile.slate.com encloses each paragraph with a DIV
|
||||
* element. DIVs with only a P element inside and no text content can be
|
||||
* safely converted into plain P elements to avoid confusing the scoring
|
||||
* algorithm with DIVs which are, in practice, paragraphs.
|
||||
*/
|
||||
if ($node->hasSinglePNode()) {
|
||||
if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
|
||||
$this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
|
||||
$pNode = $node->getChildren(true)[0];
|
||||
$node->parentNode->replaceChild($pNode, $node);
|
||||
|
@ -687,16 +730,6 @@ class Readability
|
|||
$this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
|
||||
$node = NodeUtility::setNodeTag($node, 'p');
|
||||
$elementsToScore[] = $node;
|
||||
} else {
|
||||
// EXPERIMENTAL
|
||||
foreach ($node->getChildren() as $child) {
|
||||
/** @var $child DOMNode */
|
||||
if ($child->nodeType === XML_TEXT_NODE && mb_strlen(trim($child->getTextContent())) > 0) {
|
||||
$this->logger->debug(sprintf('[Get Nodes] Found DIV a text node inside, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
|
||||
$newNode = $node->createNode($child, 'p');
|
||||
$child->parentNode->replaceChild($newNode, $child);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -751,7 +784,7 @@ class Readability
|
|||
if (gettype($text) == 'string') {
|
||||
$byline = trim($text);
|
||||
|
||||
return (mb_strlen($byline) > 0) && (mb_strlen($text) < 100);
|
||||
return (mb_strlen($byline) > 0) && (mb_strlen($byline) < 100);
|
||||
}
|
||||
|
||||
return false;
|
||||
|
@ -764,15 +797,10 @@ class Readability
|
|||
*/
|
||||
private function removeScripts(DOMDocument $dom)
|
||||
{
|
||||
$toRemove = ['script', 'noscript'];
|
||||
|
||||
foreach ($toRemove as $tag) {
|
||||
while ($script = $dom->getElementsByTagName($tag)) {
|
||||
if ($script->item(0)) {
|
||||
$script->item(0)->parentNode->removeChild($script->item(0));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
foreach (['script', 'noscript'] as $tag) {
|
||||
$nodes = $dom->getElementsByTagName($tag);
|
||||
foreach (iterator_to_array($nodes) as $node) {
|
||||
NodeUtility::removeNode($node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -786,15 +814,7 @@ class Readability
|
|||
{
|
||||
$this->logger->info('[PrepDocument] Preparing document for parsing...');
|
||||
|
||||
/*
|
||||
* DOMNodeList must be converted to an array before looping over it.
|
||||
* This is done to avoid node shifting when removing nodes.
|
||||
*
|
||||
* Reverse traversing cannot be done here because we need to find brs that are right next to other brs.
|
||||
* (If we go the other way around we need to search for previous nodes forcing the creation of new functions
|
||||
* that will be used only here)
|
||||
*/
|
||||
foreach (iterator_to_array($dom->getElementsByTagName('br')) as $br) {
|
||||
foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) {
|
||||
$next = $br->nextSibling;
|
||||
|
||||
/*
|
||||
|
@ -831,12 +851,16 @@ class Readability
|
|||
while ($next) {
|
||||
// If we've hit another <br><br>, we're done adding children to this <p>.
|
||||
if ($next->nodeName === 'br') {
|
||||
$nextElem = NodeUtility::nextElement($next);
|
||||
$nextElem = NodeUtility::nextElement($next->nextSibling);
|
||||
if ($nextElem && $nextElem->nodeName === 'br') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!$next->isPhrasingContent()) {
|
||||
break;
|
||||
}
|
||||
|
||||
$this->logger->debug('[PrepDocument] Replacing BR with a P node...');
|
||||
|
||||
// Otherwise, make this node a child of the new <p>.
|
||||
|
@ -844,6 +868,14 @@ class Readability
|
|||
$p->appendChild($next);
|
||||
$next = $sibling;
|
||||
}
|
||||
|
||||
while ($p->lastChild && $p->lastChild->isWhitespace()) {
|
||||
$p->removeChild($p->lastChild);
|
||||
}
|
||||
|
||||
if ($p->parentNode->tagName === 'p') {
|
||||
NodeUtility::setNodeTag($p->parentNode, 'div');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -853,7 +885,7 @@ class Readability
|
|||
for ($i = 0; $i < $length; $i++) {
|
||||
$this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
|
||||
$font = $fonts->item($length - 1 - $i);
|
||||
NodeUtility::setNodeTag($font, 'span', true);
|
||||
NodeUtility::setNodeTag($font, 'span');
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -989,7 +1021,9 @@ class Readability
|
|||
// and whose scores are quite close to the current `topCandidate` node.
|
||||
$alternativeCandidateAncestors = [];
|
||||
for ($i = 1; $i < count($topCandidates); $i++) {
|
||||
if ($topCandidates[$i]->contentScore / $topCandidate->contentScore >= 0.75) {
|
||||
// In some cases we may end up with a top candidate with zero content score. To avoid dividing by zero
|
||||
// we have to use max() and replace zero with a low value like 0.1
|
||||
if ($topCandidates[$i]->contentScore / max($topCandidate->contentScore, 0.1) >= 0.75) {
|
||||
array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false));
|
||||
}
|
||||
}
|
||||
|
@ -997,7 +1031,9 @@ class Readability
|
|||
$MINIMUM_TOPCANDIDATES = 3;
|
||||
if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) {
|
||||
$parentOfTopCandidate = $topCandidate->parentNode;
|
||||
while ($parentOfTopCandidate->nodeName !== 'body') {
|
||||
|
||||
// Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
|
||||
while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
|
||||
$listsContainingThisAncestor = 0;
|
||||
for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
|
||||
$listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]);
|
||||
|
@ -1027,8 +1063,7 @@ class Readability
|
|||
$scoreThreshold = $lastScore / 3;
|
||||
|
||||
/* @var DOMElement $parentOfTopCandidate */
|
||||
// Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
|
||||
while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
|
||||
while ($parentOfTopCandidate->nodeName !== 'body') {
|
||||
$parentScore = $parentOfTopCandidate->contentScore;
|
||||
if ($parentScore < $scoreThreshold) {
|
||||
break;
|
||||
|
@ -1175,6 +1210,7 @@ class Readability
|
|||
$this->_clean($article, 'h1');
|
||||
$this->_clean($article, 'footer');
|
||||
$this->_clean($article, 'link');
|
||||
$this->_clean($article, 'aside');
|
||||
|
||||
// Clean out elements that have "share" in their id/class combinations from final top candidates,
|
||||
// which means we don't remove the top candidates even if they have "share".
|
||||
|
@ -1227,6 +1263,22 @@ class Readability
|
|||
}
|
||||
}
|
||||
|
||||
// Remove single-cell tables
|
||||
foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
|
||||
/** @var DOMNode $table */
|
||||
$tbody = $table->hasSingleTagInsideElement('tbody') ? $table->childNodes[0] : $table;
|
||||
if ($tbody->hasSingleTagInsideElement('tr')) {
|
||||
$row = $tbody->firstChild;
|
||||
if ($row->hasSingleTagInsideElement('td')) {
|
||||
$cell = $row->firstChild;
|
||||
$cell = NodeUtility::setNodeTag($cell, (array_reduce(iterator_to_array($cell->childNodes), function ($carry, $node) {
|
||||
return $node->isPhrasingContent() && $carry;
|
||||
}, true)) ? 'p' : 'div');
|
||||
$table->parentNode->replaceChild($cell, $table);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $article;
|
||||
}
|
||||
|
||||
|
@ -1374,6 +1426,7 @@ class Readability
|
|||
|
||||
/**
|
||||
* @param DOMDocument $article
|
||||
* @param string $tag Tag to clean conditionally
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
|
@ -1398,7 +1451,9 @@ class Readability
|
|||
$node = $DOMNodeList->item($length - 1 - $i);
|
||||
|
||||
// First check if we're in a data table, in which case don't remove us.
|
||||
if ($node->hasAncestorTag($node, 'table', -1) && $node->isReadabilityDataTable()) {
|
||||
if ($node->hasAncestorTag('table', -1, function ($node) {
|
||||
return $node->isReadabilityDataTable();
|
||||
})) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -1439,10 +1494,10 @@ class Readability
|
|||
$contentLength = mb_strlen($node->getTextContent(true));
|
||||
|
||||
$haveToRemove =
|
||||
($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag($node, 'figure')) ||
|
||||
($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag('figure')) ||
|
||||
(!$isList && $li > $p) ||
|
||||
($input > floor($p / 3)) ||
|
||||
(!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag($node, 'figure')) ||
|
||||
(!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag('figure')) ||
|
||||
(!$isList && $weight < 25 && $linkDensity > 0.2) ||
|
||||
($weight >= 25 && $linkDensity > 0.5) ||
|
||||
(($embedCount === 1 && $contentLength < 75) || $embedCount > 1);
|
||||
|
@ -1477,7 +1532,7 @@ class Readability
|
|||
// Allow youtube and vimeo videos through as people usually want to see those.
|
||||
if ($isEmbed) {
|
||||
$attributeValues = [];
|
||||
foreach ($item->attributes as $name => $value) {
|
||||
foreach ($item->attributes as $value) {
|
||||
$attributeValues[] = $value->nodeValue;
|
||||
}
|
||||
$attributeValues = implode('|', $attributeValues);
|
||||
|
|
Loading…
Reference in New Issue