diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index b09c9c6..237c7a6 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -22,7 +22,7 @@ jobs:
strategy:
matrix:
- php: ['8.1', '8.2', '8.3', '8.4']
+ php: ['8.4']
libxml: ['2.9.14']
# Steps represent a sequence of tasks that will be executed as part of the job
diff --git a/composer.json b/composer.json
index c31fc29..fb682c3 100644
--- a/composer.json
+++ b/composer.json
@@ -27,12 +27,11 @@
"psr-4": {"fivefilters\\Readability\\Test\\": "test"}
},
"require": {
- "php": ">=8.1",
+ "php": ">=8.4",
"ext-dom": "*",
"ext-xml": "*",
"ext-mbstring": "*",
"psr/log": "^1.0 || ^2.0 || ^3.0",
- "masterminds/html5": "^2.0",
"league/uri": "^7.0"
},
"require-dev": {
diff --git a/src/Configuration.php b/src/Configuration.php
index c4fe88f..52222d3 100644
--- a/src/Configuration.php
+++ b/src/Configuration.php
@@ -20,7 +20,6 @@ class Configuration
protected bool $cleanConditionally = true;
protected bool $weightClasses = true;
protected bool $fixRelativeURLs = false;
- protected bool $substituteEntities = false;
protected bool $normalizeEntities = false;
protected bool $summonCthulhu = false;
protected string $originalURL = 'http://fakehost';
@@ -206,24 +205,6 @@ public function setFixRelativeURLs(bool $fixRelativeURLs): Configuration
return $this;
}
- /**
- * Get substitute entities.
- */
- public function getSubstituteEntities(): bool
- {
- return $this->substituteEntities;
- }
-
- /**
- * Set substitute entities.
- */
- public function setSubstituteEntities(bool $substituteEntities): Configuration
- {
- $this->substituteEntities = $substituteEntities;
-
- return $this;
- }
-
/**
* Get normalize entities.
*/
@@ -273,7 +254,11 @@ public function getParser(): string
*/
public function setParser(string $parser): Configuration
{
- $this->parser = $parser;
+ if ($parser !== 'html5') {
+ throw new \InvalidArgumentException('This version of Readability.php only supports the HTML5 parser introduced in PHP 8.4');
+ } else {
+ $this->parser = $parser;
+ }
return $this;
}
diff --git a/src/Nodes/DOM/DOMAttr.php b/src/Nodes/DOM/Attr.php
similarity index 79%
rename from src/Nodes/DOM/DOMAttr.php
rename to src/Nodes/DOM/Attr.php
index 1bdf395..ce03daf 100644
--- a/src/Nodes/DOM/DOMAttr.php
+++ b/src/Nodes/DOM/Attr.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMAttr extends \DOMAttr
+class Attr extends \DOM\Attr
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMNotation.php b/src/Nodes/DOM/CdataSection.php
similarity index 73%
rename from src/Nodes/DOM/DOMNotation.php
rename to src/Nodes/DOM/CdataSection.php
index d276e42..39bc0ce 100644
--- a/src/Nodes/DOM/DOMNotation.php
+++ b/src/Nodes/DOM/CdataSection.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMNotation extends \DOMNotation
+class CdataSection extends \DOM\CdataSection
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMCdataSection.php b/src/Nodes/DOM/CharacterData.php
similarity index 72%
rename from src/Nodes/DOM/DOMCdataSection.php
rename to src/Nodes/DOM/CharacterData.php
index 6ac3dcd..ac91426 100644
--- a/src/Nodes/DOM/DOMCdataSection.php
+++ b/src/Nodes/DOM/CharacterData.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMCdataSection extends \DOMCdataSection
+class CharacterData extends \DOM\CharacterData
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMEntity.php b/src/Nodes/DOM/Comment.php
similarity index 77%
rename from src/Nodes/DOM/DOMEntity.php
rename to src/Nodes/DOM/Comment.php
index 751b59c..7415696 100644
--- a/src/Nodes/DOM/DOMEntity.php
+++ b/src/Nodes/DOM/Comment.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMEntity extends \DOMEntity
+class Comment extends \DOM\Comment
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMDocument.php b/src/Nodes/DOM/DOMDocument.php
deleted file mode 100644
index d912338..0000000
--- a/src/Nodes/DOM/DOMDocument.php
+++ /dev/null
@@ -1,30 +0,0 @@
-registerNodeClass('DOMAttr', DOMAttr::class);
- $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class);
- $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class);
- $this->registerNodeClass('DOMComment', DOMComment::class);
- $this->registerNodeClass('DOMDocument', self::class);
- $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class);
- $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class);
- $this->registerNodeClass('DOMElement', DOMElement::class);
- $this->registerNodeClass('DOMEntity', DOMEntity::class);
- $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class);
- $this->registerNodeClass('DOMNode', DOMNode::class);
- $this->registerNodeClass('DOMNotation', DOMNotation::class);
- $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class);
- $this->registerNodeClass('DOMText', DOMText::class);
- }
-}
diff --git a/src/Nodes/DOM/DOMDocumentFragment.php b/src/Nodes/DOM/DOMDocumentFragment.php
deleted file mode 100644
index 33a3f95..0000000
--- a/src/Nodes/DOM/DOMDocumentFragment.php
+++ /dev/null
@@ -1,10 +0,0 @@
-childNodes as $node) {
if ($node->nodeType === XML_ELEMENT_NODE) {
$newList->add($node);
@@ -29,7 +29,7 @@ public function children(): DOMNodeList
*
* @deprecated Use previousElementSibling instead - introduced in PHP 8.0.
*/
- public function previousElementSibling(): ?DOMElement
+ public function previousElementSibling(): ?Element
{
return $this->previousElementSibling;
}
diff --git a/src/Nodes/DOM/DOMComment.php b/src/Nodes/DOM/Entity.php
similarity index 76%
rename from src/Nodes/DOM/DOMComment.php
rename to src/Nodes/DOM/Entity.php
index 3b691f4..721be70 100644
--- a/src/Nodes/DOM/DOMComment.php
+++ b/src/Nodes/DOM/Entity.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMComment extends \DOMComment
+class Entity extends \DOM\Entity
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMCharacterData.php b/src/Nodes/DOM/EntityReference.php
similarity index 70%
rename from src/Nodes/DOM/DOMCharacterData.php
rename to src/Nodes/DOM/EntityReference.php
index b196979..286e3f6 100644
--- a/src/Nodes/DOM/DOMCharacterData.php
+++ b/src/Nodes/DOM/EntityReference.php
@@ -4,7 +4,7 @@
use fivefilters\Readability\Nodes\NodeTrait;
-class DOMCharacterData extends \DOMCharacterData
+class EntityReference extends \DOM\EntityReference
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMNode.php b/src/Nodes/DOM/Node.php
similarity index 86%
rename from src/Nodes/DOM/DOMNode.php
rename to src/Nodes/DOM/Node.php
index 4a3ab0d..e879546 100644
--- a/src/Nodes/DOM/DOMNode.php
+++ b/src/Nodes/DOM/Node.php
@@ -8,7 +8,7 @@
* @method getAttribute($attribute)
* @method hasAttribute($attribute)
*/
-class DOMNode extends \DOMNode
+class Node extends \DOM\Node
{
use NodeTrait;
}
diff --git a/src/Nodes/DOM/DOMNodeList.php b/src/Nodes/DOM/NodeList.php
similarity index 86%
rename from src/Nodes/DOM/DOMNodeList.php
rename to src/Nodes/DOM/NodeList.php
index 2b34e22..09d2df7 100644
--- a/src/Nodes/DOM/DOMNodeList.php
+++ b/src/Nodes/DOM/NodeList.php
@@ -3,7 +3,7 @@
namespace fivefilters\Readability\Nodes\DOM;
/**
- * Class DOMNodeList.
+ * Class NodeList.
*
* This is a fake DOMNodeList class that allows adding items to the list. The original class is static and the nodes
* are defined automagically when instantiating it. This fake version behaves exactly the same way but adds the function
@@ -12,7 +12,7 @@
* It cannot extend the original DOMNodeList class because the functionality behind the property ->length is hidden
* from the user and cannot be extended, changed, or tweaked.
*/
-class DOMNodeList implements \Countable, \IteratorAggregate
+class NodeList implements \Countable, \IteratorAggregate
{
/**
* @var array
@@ -42,7 +42,7 @@ public function __get($name)
/**
* Add node to the list.
*/
- public function add(DOMNode|DOMElement|DOMText|DOMComment $node): DOMNodeList
+ public function add(Node|Element|Text|Comment $node): NodeList
{
$this->items[] = $node;
$this->length++;
@@ -53,7 +53,7 @@ public function add(DOMNode|DOMElement|DOMText|DOMComment $node): DOMNodeList
/**
* Get node.
*/
- public function item(int $offset): DOMNode|DOMElement|DOMText|DOMComment
+ public function item(int $offset): Node|Element|Text|Comment
{
return $this->items[$offset];
}
diff --git a/src/Nodes/DOM/Notation.php b/src/Nodes/DOM/Notation.php
new file mode 100644
index 0000000..8a50f5c
--- /dev/null
+++ b/src/Nodes/DOM/Notation.php
@@ -0,0 +1,10 @@
+attributes)) {
- return parent::getAttribute($attributeName);
+ if ($this instanceof \Dom\HtmlElement) {
+ return parent::getAttribute($attributeName) ?? '';
}
return '';
@@ -166,7 +165,7 @@ public function getAttribute(string $attributeName): string
*/
public function hasAttribute(string $attributeName): bool
{
- if (!is_null($this->attributes)) {
+ if ($this instanceof \Dom\HtmlElement) {
return parent::hasAttribute($attributeName);
}
@@ -185,7 +184,7 @@ public function getNodeAncestors(int|bool $maxLevel = 3): array
$node = $this->parentNode;
- while ($node && !($node instanceof DOMDocument)) {
+ while ($node && !($node instanceof \Dom\HtmlDocument)) {
$ancestors[] = $node;
$level++;
if ($level === $maxLevel) {
@@ -221,7 +220,7 @@ public function getLinkDensity(): float
$links = $this->getAllLinks();
if ($links) {
- /** @var DOMElement $link */
+ /** @var Element $link */
foreach ($links as $link) {
$href = $link->getAttribute('href');
$coefficient = ($href && preg_match(NodeUtility::$regexps['hashUrl'], $href)) ? 0.3 : 1;
@@ -273,12 +272,12 @@ public function getClassWeight(): int
*/
public function getTextContent(bool $normalize = true): string
{
- $nodeValue = trim($this->textContent);
+ $textContent = mb_trim($this->textContent);
if ($normalize) {
- $nodeValue = preg_replace(NodeUtility::$regexps['normalize'], ' ', $nodeValue);
+ $textContent = preg_replace(NodeUtility::$regexps['normalize'], ' ', $textContent);
}
- return $nodeValue;
+ return $textContent;
}
/**
@@ -289,7 +288,7 @@ public function getRowAndColumnCount(): array
$rows = $columns = 0;
$trs = $this->getElementsByTagName('tr');
foreach ($trs as $tr) {
- /** @var \DOMElement $tr */
+ /** @var \DOM\Element $tr */
$rowspan = $tr->getAttribute('rowspan');
- $rows += ($rowspan || 1);
+ $rows += ((int) $rowspan ?: 1); // `||` yields a bool in PHP, so the old expression always added 1
@@ -297,7 +296,7 @@ public function getRowAndColumnCount(): array
$columnsInThisRow = 0;
$cells = $tr->getElementsByTagName('td');
foreach ($cells as $cell) {
- /** @var \DOMElement $cell */
+ /** @var \DOM\Element $cell */
$colspan = $cell->getAttribute('colspan');
- $columnsInThisRow += ($colspan || 1);
+ $columnsInThisRow += ((int) $colspan ?: 1); // same fix: honour the colspan value instead of a bool
}
@@ -310,10 +309,11 @@ public function getRowAndColumnCount(): array
/**
* Creates a new node based on the text content of the original node.
*/
- public function createNode(DOMNode $originalNode, string $tagName): DOMElement
+ public function createNode(Node $originalNode, string $tagName): Element
{
$text = $originalNode->getTextContent(false);
- $newNode = $originalNode->ownerDocument->createElement($tagName, $text);
+ $newNode = $originalNode->ownerDocument->createElement($tagName);
+ $newNode->appendChild($originalNode->ownerDocument->createTextNode($text));
return $newNode;
}
@@ -367,18 +367,18 @@ public function hasSingleTagInsideElement(string $tag): bool
/**
* Check if the current element has a single child block element.
- * Block elements are the ones defined in the divToPElements array.
+ * Block elements are the ones defined in the DIV_TO_P_ELEMENTS array.
*/
public function hasSingleChildBlockElement(): bool
{
$result = false;
if ($this->hasChildNodes()) {
foreach ($this->childNodes as $child) {
- if (in_array($child->nodeName, $this->divToPElements)) {
+ if (in_array($child->nodeName, self::DIV_TO_P_ELEMENTS)) {
$result = true;
} else {
// If any of the hasSingleChildBlockElement calls return true, return true then.
- /** @var $child DOMElement */
+ /** @var $child Element */
$result = ($result || $child->hasSingleChildBlockElement());
}
}
@@ -392,8 +392,8 @@ public function hasSingleChildBlockElement(): bool
*/
public function isElementWithoutContent(): bool
{
- return $this instanceof DOMElement &&
- mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 &&
+ return $this instanceof Element &&
+ mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent) ?? '') === 0 &&
($this->childNodes->length === 0 ||
$this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length
/*
@@ -405,7 +405,7 @@ public function isElementWithoutContent(): bool
* mb_strlen in this chain of checks).
*/
+ count(array_filter(iterator_to_array($this->childNodes), function ($child) {
- return $child instanceof DOMText;
+ return $child instanceof Text;
}))
);
@@ -417,7 +417,7 @@ public function isElementWithoutContent(): bool
*/
public function isPhrasingContent(): bool
{
- return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems) !== false ||
+ return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, self::PHRASING_ELEMS) !== false ||
(!is_null($this->childNodes) &&
($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins') &&
array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) {
@@ -443,7 +443,7 @@ public function isProbablyVisible(): bool
*/
public function isWhitespace(): bool
{
- return ($this->nodeType === XML_TEXT_NODE && $this->isWhitespaceInElementContent()) ||
+ return ($this->nodeType === XML_TEXT_NODE && mb_strlen(mb_trim($this->textContent)) === 0) ||
($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br');
}
@@ -487,9 +487,9 @@ public function shiftingAwareGetElementsByTagName(string $tag): \Generator
/**
- * Git first element child or null
+ * Get first element child or null
*/
- public function getFirstElementChild(): ?DOMElement
+ public function getFirstElementChild(): ?Element
{
- if ($this->nodeType === XML_ELEMENT_NODE || $this->nodeType === XML_DOCUMENT_NODE) {
+ if ($this->nodeType === XML_ELEMENT_NODE || $this->nodeType === XML_HTML_DOCUMENT_NODE) {
return $this->firstElementChild;
}
diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php
index 2f094f1..90cc079 100644
--- a/src/Nodes/NodeUtility.php
+++ b/src/Nodes/NodeUtility.php
@@ -2,12 +2,12 @@
namespace fivefilters\Readability\Nodes;
-use fivefilters\Readability\Nodes\DOM\DOMDocument;
-use fivefilters\Readability\Nodes\DOM\DOMElement;
-use fivefilters\Readability\Nodes\DOM\DOMNode;
-use fivefilters\Readability\Nodes\DOM\DOMText;
-use fivefilters\Readability\Nodes\DOM\DOMComment;
-use fivefilters\Readability\Nodes\DOM\DOMNodeList;
+use fivefilters\Readability\Nodes\DOM;
+use fivefilters\Readability\Nodes\DOM\Element;
+use fivefilters\Readability\Nodes\DOM\Node;
+use fivefilters\Readability\Nodes\DOM\Text;
+use fivefilters\Readability\Nodes\DOM\Comment;
+use fivefilters\Readability\Nodes\DOM\NodeList;
/**
* Class NodeUtility.
@@ -52,7 +52,7 @@ class NodeUtility
*
* Imported from the Element class on league\html-to-markdown.
*/
- public static function nextNode(DOMNode|DOMComment|DOMText|DOMElement|null $node): DOMNode|DOMComment|DOMText|DOMElement|null
+ public static function nextNode(Node|Comment|Text|Element|null $node): Node|Comment|Text|Element|null
{
$next = $node;
while ($next
@@ -65,39 +65,17 @@ public static function nextNode(DOMNode|DOMComment|DOMText|DOMElement|null $node
}
/**
- * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
- * element with the new tag name and importing it to the main DOMDocument.
+ * Changes an element's tag name in place via rename(), which PHP 8.4 introduced for DOM\Element and DOM\Attr (it is not part of the DOM spec)
*/
- public static function setNodeTag(DOMNode|DOMElement $node, string $value, bool $importAttributes = true): DOMNode|DOMElement
+ public static function setNodeTag(Element $element, string $newName): void
{
- $new = new DOMDocument('1.0', 'utf-8');
- $new->appendChild($new->createElement($value));
-
- $children = $node->childNodes;
- /** @var $children \DOMNodeList $i */
- for ($i = 0; $i < $children->length; $i++) {
- $import = $new->importNode($children->item($i), true);
- $new->firstChild->appendChild($import);
- }
-
- if ($importAttributes) {
- // Import attributes from the original node.
- foreach ($node->attributes as $attribute) {
- $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue);
- }
- }
-
- // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement.
- $import = $node->ownerDocument->importNode($new->firstChild, true);
- $node->parentNode->replaceChild($import, $node);
-
- return $import;
+ $element->rename($element->namespaceURI, $newName);
}
/**
* Removes the current node and returns the next node to be parsed (child, sibling or parent).
*/
- public static function removeAndGetNext(DOMNode|DOMComment|DOMText|DOMElement $node): DOMNode|DOMComment|DOMText|DOMElement|null
+ public static function removeAndGetNext(Node|Comment|Text|Element $node): Node|Comment|Text|Element|null
{
$nextNode = self::getNextNode($node, true);
$node->parentNode->removeChild($node);
@@ -108,7 +86,7 @@ public static function removeAndGetNext(DOMNode|DOMComment|DOMText|DOMElement $n
/**
* Remove the selected node.
*/
- public static function removeNode(DOMNode|DOMComment|DOMText|DOMElement $node): void
+ public static function removeNode(Node|Comment|Text|Element $node): void
{
$parent = $node->parentNode;
if ($parent) {
@@ -120,7 +98,7 @@ public static function removeNode(DOMNode|DOMComment|DOMText|DOMElement $node):
* Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
* for parents.
*/
- public static function getNextNode(DOMNode|DOMComment|DOMText|DOMElement|DOMDocument $originalNode, bool $ignoreSelfAndKids = false): DOMNode|DOMComment|DOMText|DOMElement|DOMDocument|null
+ public static function getNextNode(Node|Comment|Text|Element|\Dom\HtmlDocument $originalNode, bool $ignoreSelfAndKids = false): Node|Comment|Text|Element|\Dom\HtmlDocument|null
{
/*
* Traverse the DOM from node to node, starting at the node passed in.
@@ -153,15 +131,34 @@ public static function getNextNode(DOMNode|DOMComment|DOMText|DOMElement|DOMDocu
/**
* Remove all empty DOMNodes from DOMNodeLists.
*/
- public static function filterTextNodes(\DOMNodeList $list): DOMNodeList
+ public static function filterTextNodes(\Dom\NodeList $list): NodeList
{
- $newList = new DOMNodeList();
+ $newList = new NodeList();
foreach ($list as $node) {
- if ($node->nodeType !== XML_TEXT_NODE || !$node->isWhitespaceInElementContent()) {
+ if ($node->nodeType !== XML_TEXT_NODE || mb_trim($node->nodeValue) !== '') {
$newList->add($node);
}
}
return $newList;
}
+
+ public static function registerReadabilityNodeClasses(\DOM\HtmlDocument $dom): void
+ {
+ $dom->registerNodeClass('DOM\HtmlElement', DOM\Element::class);
+ $dom->registerNodeClass('DOM\Attr', DOM\Attr::class);
+ $dom->registerNodeClass('DOM\CdataSection', DOM\CdataSection::class);
+ $dom->registerNodeClass('DOM\CharacterData', DOM\CharacterData::class);
+ $dom->registerNodeClass('DOM\Comment', DOM\Comment::class);
+ //$dom->registerNodeClass('DOM\HtmlDocument', DOM\HtmlDocument::class);
+ $dom->registerNodeClass('DOM\DocumentFragment', DOM\DocumentFragment::class);
+ $dom->registerNodeClass('DOM\DocumentType', DOM\DocumentType::class);
+ $dom->registerNodeClass('DOM\Element', DOM\Element::class);
+ $dom->registerNodeClass('DOM\Entity', DOM\Entity::class);
+ $dom->registerNodeClass('DOM\EntityReference', DOM\EntityReference::class);
+ $dom->registerNodeClass('DOM\Node', DOM\Node::class);
+ $dom->registerNodeClass('DOM\Notation', DOM\Notation::class);
+ $dom->registerNodeClass('DOM\ProcessingInstruction', DOM\ProcessingInstruction::class);
+ $dom->registerNodeClass('DOM\Text', DOM\Text::class);
+ }
}
diff --git a/src/Readability.php b/src/Readability.php
index c37c9b6..2599a23 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -2,14 +2,12 @@
namespace fivefilters\Readability;
-use fivefilters\Readability\Nodes\DOM\DOMDocument;
-use fivefilters\Readability\Nodes\DOM\DOMElement;
-use fivefilters\Readability\Nodes\DOM\DOMNode;
-use fivefilters\Readability\Nodes\DOM\DOMText;
-use fivefilters\Readability\Nodes\DOM\DOMComment;
+use fivefilters\Readability\Nodes\DOM\Element;
+use fivefilters\Readability\Nodes\DOM\Node;
+use fivefilters\Readability\Nodes\DOM\Text;
+use fivefilters\Readability\Nodes\DOM\Comment;
use fivefilters\Readability\Nodes\NodeUtility;
use Psr\Log\LoggerInterface;
-use Masterminds\HTML5;
use League\Uri\BaseUri;
/**
@@ -18,9 +16,9 @@
class Readability
{
/**
- * Main DOMDocument where all the magic happens.
+ * Main HtmlDocument where all the magic happens.
*/
- protected DOMDocument $dom;
+ protected \Dom\HtmlDocument $dom;
/**
* Title of the article.
@@ -28,9 +26,9 @@ class Readability
protected ?string $title = null;
/**
- * Final DOMDocument with the fully parsed HTML.
+ * Final HtmlDocument with the fully parsed HTML.
*/
- protected ?DOMDocument $content = null;
+ protected ?\Dom\HtmlDocument $content = null;
/**
* Excerpt of the article.
@@ -59,7 +57,7 @@ class Readability
/**
* Base URI
- * HTML5PHP doesn't appear to store it in the baseURI property like PHP's DOMDocument does when parsing with libxml
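+ * HTML5PHP didn't appear to store it in the baseURI property like PHP's DOMDocument does when parsing with libxml (NOTE(review): HTML5PHP is no longer a dependency - confirm whether this property is still needed with \Dom\HtmlDocument)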
*/
protected ?string $baseURI = null;
@@ -162,6 +160,7 @@ public function parse(?string $html = null): bool
throw new ParseException('Invalid or incomplete HTML.');
}
+ $root = $this->dom->getElementsByTagName('body')->item(0);
$bodyCache = $root->cloneNode(true);
@@ -170,12 +169,11 @@ public function parse(?string $html = null): bool
$this->getMainImage();
while (true) {
-
- $this->logger->debug('Starting parse loop');
+ $this->logger->debug('Starting parse loop (#' . count($this->attempts) . ')');
//$root = $root->firstChild;
$elementsToScore = $this->getNodes($root->firstChild);
- $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));
+ $this->logger->debug(sprintf('Elements to score: %d', count($elementsToScore)));
$result = $this->rateNodes($elementsToScore);
@@ -187,8 +185,7 @@ public function parse(?string $html = null): bool
* finding the -right- content.
*/
- $length = !$result ? 0 : mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent));
-
+ $length = !$result ? 0 : mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->documentElement->textContent));
$this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold()));
if ($result && $length < $this->configuration->getCharThreshold()) {
@@ -271,14 +268,15 @@ public function loadHTML(string $html): void
$this->logger->debug('[Loading] Loading HTML...');
// To avoid throwing a gazillion of errors on malformed HTMLs
- libxml_use_internal_errors(true);
+ // libxml_use_internal_errors(true) is no longer called here; LIBXML_NOERROR is passed to the HTML5 parser below instead
//$html = preg_replace('/(
]*>[ \n\r\t]*){2,}/i', '
', $html); if ($this->configuration->getParser() === 'html5') { $this->logger->debug('[Loading] Using HTML5 parser...'); - $html5 = new HTML5(['disable_html_ns' => true, 'target_document' => new DOMDocument('1.0', 'utf-8')]); - $dom = $html5->loadHTML($html); + // New DOM class with HTML5 parser introduced in PHP 8.4 + $dom = \Dom\HtmlDocument::createFromString($html, LIBXML_NOERROR); + NodeUtility::registerReadabilityNodeClasses($dom); //TODO: Improve this so it looks inside