From 00405970a56cd8ba7f9e9f29f30fc64b61090267 Mon Sep 17 00:00:00 2001
From: "J. King"
Date: Mon, 25 Oct 2021 18:34:23 -0400
Subject: [PATCH 1/6] Initial inclusion of parser
---
.gitignore | 3 +-
composer.json | 2 +-
src/Nodes/DOM/DOMDocument.php | 2 +-
src/Readability.php | 70 ++++++++++++++-----------
test/test-pages/citylab-1/expected.html | 8 +--
test/test-pages/cnet/expected.html | 4 +-
6 files changed, 48 insertions(+), 41 deletions(-)
diff --git a/.gitignore b/.gitignore
index 52b9f38..69c5f83 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@
vendor
composer.lock
/test.*
-/test/changed/
\ No newline at end of file
+/test/changed/
+.phpunit.result.cache
diff --git a/composer.json b/composer.json
index aba02cf..576151e 100644
--- a/composer.json
+++ b/composer.json
@@ -32,7 +32,7 @@
"ext-xml": "*",
"ext-mbstring": "*",
"psr/log": "^1.0",
- "masterminds/html5": "^2.0",
+ "mensbeam/html-parser": "^1.1.1",
"league/uri": "^6.4"
},
"require-dev": {
diff --git a/src/Nodes/DOM/DOMDocument.php b/src/Nodes/DOM/DOMDocument.php
index d912338..49a9b5d 100644
--- a/src/Nodes/DOM/DOMDocument.php
+++ b/src/Nodes/DOM/DOMDocument.php
@@ -8,7 +8,7 @@ class DOMDocument extends \DOMDocument
{
use NodeTrait;
- public function __construct($version, $encoding)
+ public function __construct($version = "1.0", $encoding = "")
{
parent::__construct($version, $encoding);
diff --git a/src/Readability.php b/src/Readability.php
index 5c8fb84..1108af4 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -8,9 +8,10 @@
use fivefilters\Readability\Nodes\DOM\DOMText;
use fivefilters\Readability\Nodes\NodeUtility;
use Psr\Log\LoggerInterface;
-use \Masterminds\HTML5;
use League\Uri\Http;
use League\Uri\UriResolver;
+use MensBeam\HTML\Parser;
+use MensBeam\HTML\Parser\Config as ParserConfig;
/**
* Class Readability.
@@ -286,48 +287,51 @@ private function loadHTML($html)
{
$this->logger->debug('[Loading] Loading HTML...');
- // To avoid throwing a gazillion of errors on malformed HTMLs
- libxml_use_internal_errors(true);
-
//$html = preg_replace('/(<br[^>]*>[ \n\r\t]*){2,}/i', '</p><p>', $html);
if ($this->configuration->getParser() === 'html5') {
$this->logger->debug('[Loading] Using HTML5 parser...');
- $html5 = new HTML5(['disable_html_ns' => true, 'target_document' => new DOMDocument('1.0', 'utf-8')]);
- $dom = $html5->loadHTML($html);
+ $config = new ParserConfig();
+ $config->documentClass = DOMDocument::class;
+ $dom = Parser::parse($html, "", $config)->document;
//TODO: Improve this so it looks inside <head>, not just any <base>
$base = $dom->getElementsByTagName('base');
if ($base->length > 0) {
- $base = $base->item(0);
- $base = $base->getAttribute('href');
+ $base = $base->item(0)->getAttribute('href');
if ($base != '') {
$this->baseURI = $base;
}
}
} else {
$this->logger->debug('[Loading] Using libxml parser...');
- $dom = new DOMDocument('1.0', 'utf-8');
- if ($this->configuration->getNormalizeEntities()) {
- $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
- // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
- $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
- }
- }
-
- if (!$this->configuration->getSubstituteEntities()) {
- // Keep the original HTML entities
- $dom->substituteEntities = false;
- }
+ // To avoid throwing a gazillion of errors on malformed HTMLs
+ $libxml_err = libxml_use_internal_errors(true);
+ try {
+ $dom = new DOMDocument('1.0', 'utf-8');
+ if ($this->configuration->getNormalizeEntities()) {
+ $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
+ // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
+ $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
+ }
+ if (!$this->configuration->getSubstituteEntities()) {
+ // Keep the original HTML entities
+ $dom->substituteEntities = false;
+ }
+ if ($this->configuration->getSummonCthulhu()) {
+ $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
+ $html = preg_replace('/