diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b09c9c6..237c7a6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -22,7 +22,7 @@ jobs: strategy: matrix: - php: ['8.1', '8.2', '8.3', '8.4'] + php: ['8.4'] libxml: ['2.9.14'] # Steps represent a sequence of tasks that will be executed as part of the job diff --git a/composer.json b/composer.json index c31fc29..fb682c3 100644 --- a/composer.json +++ b/composer.json @@ -27,12 +27,11 @@ "psr-4": {"fivefilters\\Readability\\Test\\": "test"} }, "require": { - "php": ">=8.1", + "php": ">=8.4", "ext-dom": "*", "ext-xml": "*", "ext-mbstring": "*", "psr/log": "^1.0 || ^2.0 || ^3.0", - "masterminds/html5": "^2.0", "league/uri": "^7.0" }, "require-dev": { diff --git a/src/Configuration.php b/src/Configuration.php index c4fe88f..52222d3 100644 --- a/src/Configuration.php +++ b/src/Configuration.php @@ -20,7 +20,6 @@ class Configuration protected bool $cleanConditionally = true; protected bool $weightClasses = true; protected bool $fixRelativeURLs = false; - protected bool $substituteEntities = false; protected bool $normalizeEntities = false; protected bool $summonCthulhu = false; protected string $originalURL = 'http://fakehost'; @@ -206,24 +205,6 @@ public function setFixRelativeURLs(bool $fixRelativeURLs): Configuration return $this; } - /** - * Get substitute entities. - */ - public function getSubstituteEntities(): bool - { - return $this->substituteEntities; - } - - /** - * Set substitute entities. - */ - public function setSubstituteEntities(bool $substituteEntities): Configuration - { - $this->substituteEntities = $substituteEntities; - - return $this; - } - /** * Get normalize entities. */ @@ -273,7 +254,11 @@ public function getParser(): string */ public function setParser(string $parser): Configuration { - $this->parser = $parser; + if ($parser !== 'html5') { + throw new \InvalidArgumentException('This version of Readability.php only supports the HTML5 parser introduced in PHP 8.4'); + } else { + $this->parser = $parser; + } return $this; } diff --git a/src/Nodes/DOM/DOMAttr.php b/src/Nodes/DOM/Attr.php similarity index 79% rename from src/Nodes/DOM/DOMAttr.php rename to src/Nodes/DOM/Attr.php index 1bdf395..ce03daf 100644 --- a/src/Nodes/DOM/DOMAttr.php +++ b/src/Nodes/DOM/Attr.php @@ -4,7 +4,7 @@ use fivefilters\Readability\Nodes\NodeTrait; -class DOMAttr extends \DOMAttr +class Attr extends \DOM\Attr { use NodeTrait; } diff --git a/src/Nodes/DOM/DOMNotation.php b/src/Nodes/DOM/CdataSection.php similarity index 73% rename from src/Nodes/DOM/DOMNotation.php rename to src/Nodes/DOM/CdataSection.php index d276e42..39bc0ce 100644 --- a/src/Nodes/DOM/DOMNotation.php +++ b/src/Nodes/DOM/CdataSection.php @@ -4,7 +4,7 @@ use fivefilters\Readability\Nodes\NodeTrait; -class DOMNotation extends \DOMNotation +class CdataSection extends \DOM\CdataSection { use NodeTrait; } diff --git a/src/Nodes/DOM/DOMCdataSection.php b/src/Nodes/DOM/CharacterData.php similarity index 72% rename from src/Nodes/DOM/DOMCdataSection.php rename to src/Nodes/DOM/CharacterData.php index 6ac3dcd..ac91426 100644 --- a/src/Nodes/DOM/DOMCdataSection.php +++ b/src/Nodes/DOM/CharacterData.php @@ -4,7 +4,7 @@ use fivefilters\Readability\Nodes\NodeTrait; -class DOMCdataSection extends \DOMCdataSection +class CharacterData extends \DOM\CharacterData { use NodeTrait; } diff --git a/src/Nodes/DOM/DOMEntity.php b/src/Nodes/DOM/Comment.php similarity index 77% rename from src/Nodes/DOM/DOMEntity.php rename to src/Nodes/DOM/Comment.php index 751b59c..7415696 100644 --- a/src/Nodes/DOM/DOMEntity.php +++ b/src/Nodes/DOM/Comment.php @@ -4,7 +4,7 @@ use fivefilters\Readability\Nodes\NodeTrait; -class DOMEntity extends \DOMEntity +class Comment extends \DOM\Comment { use NodeTrait; } diff --git a/src/Nodes/DOM/DOMDocument.php b/src/Nodes/DOM/DOMDocument.php deleted file mode 100644 index d912338..0000000 --- a/src/Nodes/DOM/DOMDocument.php +++ /dev/null @@ -1,30 +0,0 @@ -registerNodeClass('DOMAttr', DOMAttr::class); - $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class); - $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class); - $this->registerNodeClass('DOMComment', DOMComment::class); - $this->registerNodeClass('DOMDocument', self::class); - $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class); - $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class); - $this->registerNodeClass('DOMElement', DOMElement::class); - $this->registerNodeClass('DOMEntity', DOMEntity::class); - $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class); - $this->registerNodeClass('DOMNode', DOMNode::class); - $this->registerNodeClass('DOMNotation', DOMNotation::class); - $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class); - $this->registerNodeClass('DOMText', DOMText::class); - } -} diff --git a/src/Nodes/DOM/DOMDocumentFragment.php b/src/Nodes/DOM/DOMDocumentFragment.php deleted file mode 100644 index 33a3f95..0000000 --- a/src/Nodes/DOM/DOMDocumentFragment.php +++ /dev/null @@ -1,10 +0,0 @@ -childNodes as $node) { if ($node->nodeType === XML_ELEMENT_NODE) { $newList->add($node); @@ -29,7 +29,7 @@ public function children(): DOMNodeList * * @deprecated Use previousElementSibling instead - introduced in PHP 8.0. */ - public function previousElementSibling(): ?DOMElement + public function previousElementSibling(): ?Element { return $this->previousElementSibling; } diff --git a/src/Nodes/DOM/DOMComment.php b/src/Nodes/DOM/Entity.php similarity index 76% rename from src/Nodes/DOM/DOMComment.php rename to src/Nodes/DOM/Entity.php index 3b691f4..721be70 100644 --- a/src/Nodes/DOM/DOMComment.php +++ b/src/Nodes/DOM/Entity.php @@ -4,7 +4,7 @@ use fivefilters\Readability\Nodes\NodeTrait; -class DOMComment extends \DOMComment +class Entity extends \DOM\Entity { use NodeTrait; } diff --git a/src/Nodes/DOM/DOMCharacterData.php b/src/Nodes/DOM/EntityReference.php similarity index 70% rename from src/Nodes/DOM/DOMCharacterData.php rename to src/Nodes/DOM/EntityReference.php index b196979..286e3f6 100644 --- a/src/Nodes/DOM/DOMCharacterData.php +++ b/src/Nodes/DOM/EntityReference.php @@ -4,7 +4,7 @@ use fivefilters\Readability\Nodes\NodeTrait; -class DOMCharacterData extends \DOMCharacterData +class EntityReference extends \DOM\EntityReference { use NodeTrait; } diff --git a/src/Nodes/DOM/DOMNode.php b/src/Nodes/DOM/Node.php similarity index 86% rename from src/Nodes/DOM/DOMNode.php rename to src/Nodes/DOM/Node.php index 4a3ab0d..e879546 100644 --- a/src/Nodes/DOM/DOMNode.php +++ b/src/Nodes/DOM/Node.php @@ -8,7 +8,7 @@ * @method getAttribute($attribute) * @method hasAttribute($attribute) */ -class DOMNode extends \DOMNode +class Node extends \DOM\Node { use NodeTrait; } diff --git a/src/Nodes/DOM/DOMNodeList.php b/src/Nodes/DOM/NodeList.php similarity index 86% rename from src/Nodes/DOM/DOMNodeList.php rename to src/Nodes/DOM/NodeList.php index 2b34e22..09d2df7 100644 --- a/src/Nodes/DOM/DOMNodeList.php +++ b/src/Nodes/DOM/NodeList.php @@ -3,7 +3,7 @@ namespace fivefilters\Readability\Nodes\DOM; /** - * Class DOMNodeList. + * Class NodeList. * * This is a fake DOMNodeList class that allows adding items to the list. The original class is static and the nodes * are defined automagically when instantiating it. This fake version behaves exactly the same way but adds the function @@ -12,7 +12,7 @@ * It cannot extend the original DOMNodeList class because the functionality behind the property ->length is hidden * from the user and cannot be extended, changed, or tweaked. */ -class DOMNodeList implements \Countable, \IteratorAggregate +class NodeList implements \Countable, \IteratorAggregate { /** * @var array @@ -42,7 +42,7 @@ public function __get($name) /** * Add node to the list. */ - public function add(DOMNode|DOMElement|DOMText|DOMComment $node): DOMNodeList + public function add(Node|Element|Text|Comment $node): NodeList { $this->items[] = $node; $this->length++; @@ -53,7 +53,7 @@ public function add(DOMNode|DOMElement|DOMText|DOMComment $node): DOMNodeList /** * Get node. */ - public function item(int $offset): DOMNode|DOMElement|DOMText|DOMComment + public function item(int $offset): Node|Element|Text|Comment { return $this->items[$offset]; } diff --git a/src/Nodes/DOM/Notation.php b/src/Nodes/DOM/Notation.php new file mode 100644 index 0000000..8a50f5c --- /dev/null +++ b/src/Nodes/DOM/Notation.php @@ -0,0 +1,10 @@ +attributes)) { - return parent::getAttribute($attributeName); + if ($this instanceof \Dom\HtmlElement) { + return parent::getAttribute($attributeName) ?? ''; } return ''; @@ -166,7 +165,7 @@ public function getAttribute(string $attributeName): string */ public function hasAttribute(string $attributeName): bool { - if (!is_null($this->attributes)) { + if ($this instanceof \Dom\HtmlElement) { return parent::hasAttribute($attributeName); } @@ -185,7 +184,7 @@ public function getNodeAncestors(int|bool $maxLevel = 3): array $node = $this->parentNode; - while ($node && !($node instanceof DOMDocument)) { + while ($node && !($node instanceof \Dom\HtmlDocument)) { $ancestors[] = $node; $level++; if ($level === $maxLevel) { @@ -221,7 +220,7 @@ public function getLinkDensity(): float $links = $this->getAllLinks(); if ($links) { - /** @var DOMElement $link */ + /** @var Element $link */ foreach ($links as $link) { $href = $link->getAttribute('href'); $coefficient = ($href && preg_match(NodeUtility::$regexps['hashUrl'], $href)) ? 0.3 : 1; @@ -273,12 +272,12 @@ public function getClassWeight(): int */ public function getTextContent(bool $normalize = true): string { - $nodeValue = trim($this->textContent); + $textContent = mb_trim($this->textContent); if ($normalize) { - $nodeValue = preg_replace(NodeUtility::$regexps['normalize'], ' ', $nodeValue); + $textContent = preg_replace(NodeUtility::$regexps['normalize'], ' ', $textContent); } - return $nodeValue; + return $textContent; } /** @@ -289,7 +288,7 @@ public function getRowAndColumnCount(): array $rows = $columns = 0; $trs = $this->getElementsByTagName('tr'); foreach ($trs as $tr) { - /** @var \DOMElement $tr */ + /** @var \DOM\Element $tr */ $rowspan = $tr->getAttribute('rowspan'); $rows += ($rowspan || 1); @@ -297,7 +296,7 @@ public function getRowAndColumnCount(): array $columnsInThisRow = 0; $cells = $tr->getElementsByTagName('td'); foreach ($cells as $cell) { - /** @var \DOMElement $cell */ + /** @var \DOM\Element $cell */ $colspan = $cell->getAttribute('colspan'); $columnsInThisRow += ($colspan || 1); } @@ -310,10 +309,11 @@ public function getRowAndColumnCount(): array /** * Creates a new node based on the text content of the original node. */ - public function createNode(DOMNode $originalNode, string $tagName): DOMElement + public function createNode(Node $originalNode, string $tagName): Element { $text = $originalNode->getTextContent(false); - $newNode = $originalNode->ownerDocument->createElement($tagName, $text); + $newNode = $originalNode->ownerDocument->createElement($tagName); + $newNode->appendChild($originalNode->ownerDocument->createTextNode($text)); return $newNode; } @@ -367,18 +367,18 @@ public function hasSingleTagInsideElement(string $tag): bool /** * Check if the current element has a single child block element. - * Block elements are the ones defined in the divToPElements array. + * Block elements are the ones defined in the DIV_TO_P_ELEMENTS array. */ public function hasSingleChildBlockElement(): bool { $result = false; if ($this->hasChildNodes()) { foreach ($this->childNodes as $child) { - if (in_array($child->nodeName, $this->divToPElements)) { + if (in_array($child->nodeName, self::DIV_TO_P_ELEMENTS)) { $result = true; } else { // If any of the hasSingleChildBlockElement calls return true, return true then. - /** @var $child DOMElement */ + /** @var $child Element */ $result = ($result || $child->hasSingleChildBlockElement()); } } @@ -392,8 +392,8 @@ public function hasSingleChildBlockElement(): bool */ public function isElementWithoutContent(): bool { - return $this instanceof DOMElement && - mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 && + return $this instanceof Element && + mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent) ?? '') === 0 && ($this->childNodes->length === 0 || $this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length /* @@ -405,7 +405,7 @@ public function isElementWithoutContent(): bool * mb_strlen in this chain of checks). */ + count(array_filter(iterator_to_array($this->childNodes), function ($child) { - return $child instanceof DOMText; + return $child instanceof Text; })) ); @@ -417,7 +417,7 @@ public function isElementWithoutContent(): bool */ public function isPhrasingContent(): bool { - return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems) !== false || + return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, self::PHRASING_ELEMS) !== false || (!is_null($this->childNodes) && ($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins') && array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) { @@ -443,7 +443,7 @@ public function isProbablyVisible(): bool */ public function isWhitespace(): bool { - return ($this->nodeType === XML_TEXT_NODE && $this->isWhitespaceInElementContent()) || + return ($this->nodeType === XML_TEXT_NODE && mb_strlen(mb_trim($this->textContent)) === 0) || ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br'); } @@ -487,9 +487,9 @@ public function shiftingAwareGetElementsByTagName(string $tag): \Generator /** * Git first element child or null */ - public function getFirstElementChild(): ?DOMElement + public function getFirstElementChild(): ?Element { - if ($this->nodeType === XML_ELEMENT_NODE || $this->nodeType === XML_DOCUMENT_NODE) { + if ($this->nodeType === XML_ELEMENT_NODE || $this->nodeType === XML_HTML_DOCUMENT_NODE) { return $this->firstElementChild; } diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php index 2f094f1..90cc079 100644 --- a/src/Nodes/NodeUtility.php +++ b/src/Nodes/NodeUtility.php @@ -2,12 +2,12 @@ namespace fivefilters\Readability\Nodes; -use fivefilters\Readability\Nodes\DOM\DOMDocument; -use fivefilters\Readability\Nodes\DOM\DOMElement; -use fivefilters\Readability\Nodes\DOM\DOMNode; -use fivefilters\Readability\Nodes\DOM\DOMText; -use fivefilters\Readability\Nodes\DOM\DOMComment; -use fivefilters\Readability\Nodes\DOM\DOMNodeList; +use fivefilters\Readability\Nodes\DOM; +use fivefilters\Readability\Nodes\DOM\Element; +use fivefilters\Readability\Nodes\DOM\Node; +use fivefilters\Readability\Nodes\DOM\Text; +use fivefilters\Readability\Nodes\DOM\Comment; +use fivefilters\Readability\Nodes\DOM\NodeList; /** * Class NodeUtility. @@ -52,7 +52,7 @@ class NodeUtility * * Imported from the Element class on league\html-to-markdown. */ - public static function nextNode(DOMNode|DOMComment|DOMText|DOMElement|null $node): DOMNode|DOMComment|DOMText|DOMElement|null + public static function nextNode(Node|Comment|Text|Element|null $node): Node|Comment|Text|Element|null { $next = $node; while ($next @@ -65,39 +65,17 @@ public static function nextNode(DOMNode|DOMComment|DOMText|DOMElement|null $node } /** - * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new - * element with the new tag name and importing it to the main DOMDocument. + * Not in the DOM spec, but PHP 8.4 introduced rename() for DOM\Element and DOM\Attr */ - public static function setNodeTag(DOMNode|DOMElement $node, string $value, bool $importAttributes = true): DOMNode|DOMElement + public static function setNodeTag(Element $element, string $newName): void { - $new = new DOMDocument('1.0', 'utf-8'); - $new->appendChild($new->createElement($value)); - - $children = $node->childNodes; - /** @var $children \DOMNodeList $i */ - for ($i = 0; $i < $children->length; $i++) { - $import = $new->importNode($children->item($i), true); - $new->firstChild->appendChild($import); - } - - if ($importAttributes) { - // Import attributes from the original node. - foreach ($node->attributes as $attribute) { - $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue); - } - } - - // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement. - $import = $node->ownerDocument->importNode($new->firstChild, true); - $node->parentNode->replaceChild($import, $node); - - return $import; + $element->rename($element->namespaceURI, $newName); } /** * Removes the current node and returns the next node to be parsed (child, sibling or parent). */ - public static function removeAndGetNext(DOMNode|DOMComment|DOMText|DOMElement $node): DOMNode|DOMComment|DOMText|DOMElement|null + public static function removeAndGetNext(Node|Comment|Text|Element $node): Node|Comment|Text|Element|null { $nextNode = self::getNextNode($node, true); $node->parentNode->removeChild($node); @@ -108,7 +86,7 @@ public static function removeAndGetNext(DOMNode|DOMComment|DOMText|DOMElement $n /** * Remove the selected node. */ - public static function removeNode(DOMNode|DOMComment|DOMText|DOMElement $node): void + public static function removeNode(Node|Comment|Text|Element $node): void { $parent = $node->parentNode; if ($parent) { @@ -120,7 +98,7 @@ public static function removeNode(DOMNode|DOMComment|DOMText|DOMElement $node): * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally * for parents. */ - public static function getNextNode(DOMNode|DOMComment|DOMText|DOMElement|DOMDocument $originalNode, bool $ignoreSelfAndKids = false): DOMNode|DOMComment|DOMText|DOMElement|DOMDocument|null + public static function getNextNode(Node|Comment|Text|Element|\Dom\HtmlDocument $originalNode, bool $ignoreSelfAndKids = false): Node|Comment|Text|Element|\Dom\HtmlDocument|null { /* * Traverse the DOM from node to node, starting at the node passed in. @@ -153,15 +131,34 @@ public static function getNextNode(DOMNode|DOMComment|DOMText|DOMElement|DOMDocu /** * Remove all empty DOMNodes from DOMNodeLists. */ - public static function filterTextNodes(\DOMNodeList $list): DOMNodeList + public static function filterTextNodes(\Dom\NodeList $list): NodeList { - $newList = new DOMNodeList(); + $newList = new NodeList(); foreach ($list as $node) { - if ($node->nodeType !== XML_TEXT_NODE || !$node->isWhitespaceInElementContent()) { + if ($node->nodeType !== XML_TEXT_NODE || mb_trim($node->nodeValue) !== '') { $newList->add($node); } } return $newList; } + + public static function registerReadabilityNodeClasses(\DOM\HtmlDocument $dom): void + { + $dom->registerNodeClass('DOM\HtmlElement', DOM\Element::class); + $dom->registerNodeClass('DOM\Attr', DOM\Attr::class); + $dom->registerNodeClass('DOM\CdataSection', DOM\CdataSection::class); + $dom->registerNodeClass('DOM\CharacterData', DOM\CharacterData::class); + $dom->registerNodeClass('DOM\Comment', DOM\Comment::class); + //$dom->registerNodeClass('DOM\HtmlDocument', DOM\HtmlDocument::class); + $dom->registerNodeClass('DOM\DocumentFragment', DOM\DocumentFragment::class); + $dom->registerNodeClass('DOM\DocumentType', DOM\DocumentType::class); + $dom->registerNodeClass('DOM\Element', DOM\Element::class); + $dom->registerNodeClass('DOM\Entity', DOM\Entity::class); + $dom->registerNodeClass('DOM\EntityReference', DOM\EntityReference::class); + $dom->registerNodeClass('DOM\Node', DOM\Node::class); + $dom->registerNodeClass('DOM\Notation', DOM\Notation::class); + $dom->registerNodeClass('DOM\ProcessingInstruction', DOM\ProcessingInstruction::class); + $dom->registerNodeClass('DOM\Text', DOM\Text::class); + } } diff --git a/src/Readability.php b/src/Readability.php index c37c9b6..2599a23 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -2,14 +2,12 @@ namespace fivefilters\Readability; -use fivefilters\Readability\Nodes\DOM\DOMDocument; -use fivefilters\Readability\Nodes\DOM\DOMElement; -use fivefilters\Readability\Nodes\DOM\DOMNode; -use fivefilters\Readability\Nodes\DOM\DOMText; -use fivefilters\Readability\Nodes\DOM\DOMComment; +use fivefilters\Readability\Nodes\DOM\Element; +use fivefilters\Readability\Nodes\DOM\Node; +use fivefilters\Readability\Nodes\DOM\Text; +use fivefilters\Readability\Nodes\DOM\Comment; use fivefilters\Readability\Nodes\NodeUtility; use Psr\Log\LoggerInterface; -use Masterminds\HTML5; use League\Uri\BaseUri; /** @@ -18,9 +16,9 @@ class Readability { /** - * Main DOMDocument where all the magic happens. + * Main HtmlDocument where all the magic happens. */ - protected DOMDocument $dom; + protected \Dom\HtmlDocument $dom; /** * Title of the article. @@ -28,9 +26,9 @@ class Readability protected ?string $title = null; /** - * Final DOMDocument with the fully parsed HTML. + * Final HtmlDocument with the fully parsed HTML. */ - protected ?DOMDocument $content = null; + protected ?\Dom\HtmlDocument $content = null; /** * Excerpt of the article. @@ -59,7 +57,7 @@ class Readability /** * Base URI - * HTML5PHP doesn't appear to store it in the baseURI property like PHP's DOMDocument does when parsing with libxml + * HTML5PHP doesn't appear to store it in the baseURI property like PHP's HtmlDocument does when parsing with libxml */ protected ?string $baseURI = null; @@ -162,6 +160,7 @@ public function parse(?string $html = null): bool throw new ParseException('Invalid or incomplete HTML.'); } + $root = $this->dom->getElementsByTagName('body')->item(0); $bodyCache = $root->cloneNode(true); @@ -170,12 +169,11 @@ public function parse(?string $html = null): bool $this->getMainImage(); while (true) { - - $this->logger->debug('Starting parse loop'); + $this->logger->debug('Starting parse loop (#' . count($this->attempts) . ')'); //$root = $root->firstChild; $elementsToScore = $this->getNodes($root->firstChild); - $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore))); + $this->logger->debug(sprintf('Elements to score: %d', count($elementsToScore))); $result = $this->rateNodes($elementsToScore); @@ -187,8 +185,7 @@ public function parse(?string $html = null): bool * finding the -right- content. */ - $length = !$result ? 0 : mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent)); - + $length = !$result ? 0 : mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->documentElement->textContent)); $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold())); if ($result && $length < $this->configuration->getCharThreshold()) { @@ -271,14 +268,15 @@ public function loadHTML(string $html): void $this->logger->debug('[Loading] Loading HTML...'); // To avoid throwing a gazillion of errors on malformed HTMLs - libxml_use_internal_errors(true); + //libxml_use_internal_errors(true); //$html = preg_replace('/(]*>[ \n\r\t]*){2,}/i', '

', $html); if ($this->configuration->getParser() === 'html5') { $this->logger->debug('[Loading] Using HTML5 parser...'); - $html5 = new HTML5(['disable_html_ns' => true, 'target_document' => new DOMDocument('1.0', 'utf-8')]); - $dom = $html5->loadHTML($html); + // New DOM class with HTML5 parser introduced in PHP 8.4 + $dom = \Dom\HtmlDocument::createFromString($html, LIBXML_NOERROR); + NodeUtility::registerReadabilityNodeClasses($dom); //TODO: Improve this so it looks inside , not just any $base = $dom->getElementsByTagName('base'); if ($base->length > 0) { @@ -289,31 +287,9 @@ public function loadHTML(string $html): void } } } else { - $this->logger->debug('[Loading] Using libxml parser...'); - $dom = new DOMDocument('1.0', 'utf-8'); - if ($this->configuration->getNormalizeEntities()) { - $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.'); - // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content - $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); - } - } - - if (!$this->configuration->getSubstituteEntities()) { - // Keep the original HTML entities - $dom->substituteEntities = false; - } - - if ($this->configuration->getSummonCthulhu()) { - $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘'); - $html = preg_replace('/]*>([\s\S]*?)<\/script>/', '', $html); - } - - // Prepend the XML tag to avoid having issues with special characters. Should be harmless. - if ($this->configuration->getParser() !== 'html5') { - $dom->loadHTML('' . $html); - $this->baseURI = $dom->baseURI; + // Throw exception if the parser is not HTML5 + throw new ParseException('This version of Readability.php only supports the HTML5 parser introduced in PHP 8.4'); } - $dom->encoding = 'UTF-8'; $this->logger->debug('[Loading] Loaded HTML successfully.'); @@ -326,7 +302,7 @@ public function loadHTML(string $html): void * * @return array with any metadata that could be extracted (possibly none) */ - private function getJSONLD(DOMDocument $dom): array + private function getJSONLD(\Dom\HtmlDocument $dom): array { $scripts = $this->_getAllNodesWithTag($dom, ['script']); @@ -418,7 +394,7 @@ private function getMetadata(): void /* @var DOMNode $meta */ $elementName = $meta->getAttribute('name'); $elementProperty = $meta->getAttribute('property'); - $content = $meta->getAttribute('content'); + $content = $meta->getAttribute('content'); $matches = null; $name = null; @@ -597,7 +573,7 @@ public function getMainImage(): void /** * Remove unnecessary nested elements */ - private function simplifyNestedElements(DOMDocument $article): void + private function simplifyNestedElements(\Dom\HtmlDocument $article): void { $node = $article; @@ -635,8 +611,8 @@ private function getArticleTitle(): ?string $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...'); $titleTag = $this->dom->getElementsByTagName('title'); if ($titleTag->length > 0) { - $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue)); - $originalTitle = $titleTag->item(0)->nodeValue; + $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->textContent)); + $originalTitle = $titleTag->item(0)->textContent; } } @@ -672,7 +648,7 @@ private function getArticleTitle(): ?string for ($i = 1; $i <= 2; $i++) { foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) { // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs - if (trim($hTag->nodeValue) === trim($curTitle)) { + if (trim($hTag->textContent) === trim($curTitle)) { $match = true; } } @@ -698,7 +674,7 @@ private function getArticleTitle(): ?string $hOnes = $this->dom->getElementsByTagName('h1'); if ($hOnes->length === 1) { - $curTitle = $hOnes->item(0)->nodeValue; + $curTitle = $hOnes->item(0)->textContent; $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle)); } } @@ -795,7 +771,7 @@ public function getPathInfo(string $url): array /** * Gets nodes from the root element. */ - private function getNodes(DOMNode|DOMComment|DOMText|DOMElement|null $node): array + private function getNodes(Node|Comment|Text|Element|null $node): array { $this->logger->info('[Get Nodes] Retrieving nodes...'); if ($node === null) { @@ -817,7 +793,7 @@ private function getNodes(DOMNode|DOMComment|DOMText|DOMElement|null $node): arr while ($node) { // Remove DOMComments nodes as we don't need them and mess up children counting if ($node->nodeType === XML_COMMENT_NODE) { - $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128))); + $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', substr($node->textContent, 0, 128))); $node = NodeUtility::removeAndGetNext($node); continue; } @@ -832,7 +808,7 @@ private function getNodes(DOMNode|DOMComment|DOMText|DOMElement|null $node): arr // Check to see if this node is a byline, and remove it if it is. if ($this->checkByline($node, $matchString)) { - $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128))); + $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->textContent, 0, 128))); $node = NodeUtility::removeAndGetNext($node); continue; } @@ -854,7 +830,7 @@ private function getNodes(DOMNode|DOMComment|DOMText|DOMElement|null $node): arr $node->nodeName !== 'body' && $node->nodeName !== 'a' ) { - $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', substr($node->nodeValue, 0, 128))); + $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', substr($node->textContent, 0, 128))); $node = NodeUtility::removeAndGetNext($node); continue; } @@ -878,7 +854,7 @@ private function getNodes(DOMNode|DOMComment|DOMText|DOMElement|null $node): arr } if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore)) { - $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', substr($node->nodeValue, 0, 128))); + $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', substr($node->textContent, 0, 128))); $elementsToScore[] = $node; } @@ -914,14 +890,14 @@ private function getNodes(DOMNode|DOMComment|DOMText|DOMElement|null $node): arr * algorithm with DIVs with are, in practice, paragraphs. */ if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) { - $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128))); + $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->textContent, 0, 128))); $pNode = NodeUtility::filterTextNodes($node->childNodes)->item(0); $node->parentNode->replaceChild($pNode, $node); $node = $pNode; $elementsToScore[] = $node; } elseif (!$node->hasSingleChildBlockElement()) { - $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128))); - $node = NodeUtility::setNodeTag($node, 'p'); + $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->textContent, 0, 128))); + NodeUtility::setNodeTag($node, 'p'); $elementsToScore[] = $node; } } @@ -960,7 +936,7 @@ private function textSimilarity(string $textA, string $textB): float /** * Checks if the node is a byline. */ - private function checkByline(DOMNode|DOMText|DOMElement $node, string $matchString): bool + private function checkByline(Node|Text|Element $node, string $matchString): bool { if (!$this->configuration->getArticleByline()) { return false; @@ -1030,7 +1006,7 @@ private function unescapeHtmlEntities(?string $str): ?string * Check if node is image, or if node contains exactly only one image * whether as a direct child or as its descendants. */ - private function isSingleImage(DOMElement|DOMNode|DOMText $node): bool + private function isSingleImage(Element|Node|Text $node): bool { if ($node->tagName === 'img') { return true; @@ -1049,7 +1025,7 @@ private function isSingleImage(DOMElement|DOMNode|DOMText $node): bool * and remove the