diff --git a/vendor/simplehtmldom/LICENSE b/vendor/simplehtmldom/LICENSE
new file mode 100644
index 00000000..6040f77b
--- /dev/null
+++ b/vendor/simplehtmldom/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 S.C. Chen, John Schlick, logmanoriginal
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/vendor/simplehtmldom/simple_html_dom.php b/vendor/simplehtmldom/simple_html_dom.php
index c0001e3b..d30b018e 100644
--- a/vendor/simplehtmldom/simple_html_dom.php
+++ b/vendor/simplehtmldom/simple_html_dom.php
@@ -3,64 +3,24 @@
* Website: http://sourceforge.net/projects/simplehtmldom/
* Additional projects: http://sourceforge.net/projects/debugobject/
* Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
- * Contributions by:
- * Yousuke Kumakura (Attribute filters)
- * Vadim Voituk (Negative indexes supports of "find" method)
- * Antcs (Constructor with automatically load contents either text or file/url)
- *
- * all affected sections have comments starting with "PaperG"
- *
- * Paperg - Added case insensitive testing of the value of the selector.
- *
- * Paperg - Added tag_start for the starting index of tags - NOTE: This works
- * but not accurately. This tag_start gets counted AFTER \r\n have been crushed
- * out, and after the remove_noice calls so it will not reflect the REAL
- * position of the tag in the source, it will almost always be smaller by some
- * amount. We use this to determine how far into the file the tag in question
- * is. This "percentage" will never be accurate as the $dom->size is the "real"
- * number of bytes the dom was created from. But for most purposes, it's a
- * really good estimation.
- *
- * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
- * closed is great for malformed html, but it CAN lead to parsing errors.
- *
- * Allow the user to tell us how much they trust the html.
- *
- * Paperg add the text and plaintext to the selectors for the find syntax.
- * plaintext implies text in the innertext of a node. text implies that the
- * tag is a text node. This allows for us to find tags based on the text they
- * contain.
- *
- * Create find_ancestor_tag to see if a tag is - at any level - inside of
- * another specific tag.
- *
- * Paperg: added parse_charset so that we know about the character set of
- * the source document. NOTE: If the user's system has a routine called
- * get_last_retrieve_url_contents_content_type availalbe, we will assume it's
- * returning the content-type header from the last transfer or curl_exec, and
- * we will parse that and use it in preference to any other method of charset
- * detection.
- *
- * Found infinite loop in the case of broken html in restore_noise. Rewrote to
- * protect from that.
- *
- * PaperG (John Schlick) Added get_display_size for "IMG" tags.
*
* Licensed under The MIT License
- * Redistributions of files must retain the above copyright notice.
+ * See the LICENSE file in the project root for more information.
*
- * @author S.C. Chen
- * @author John Schlick
- * @author Rus Carroll
- * @version Rev. 1.8.1 (247)
- * @package PlaceLocalInclude
- * @subpackage simple_html_dom
+ * Authors:
+ * S.C. Chen
+ * John Schlick
+ * Rus Carroll
+ * logmanoriginal
+ *
+ * Contributors:
+ * Yousuke Kumakura
+ * Vadim Voituk
+ * Antcs
+ *
+ * Version Rev. 1.9 (290)
*/
-/**
- * All of the Defines for the classes below.
- * @author S.C. Chen
- */
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
@@ -79,25 +39,12 @@ define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);
-/** The default target charset */
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
-
-/** The default <br> text used instead of <br> tags when returning text */
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
-
-/** The default <span> text used instead of <span> tags when returning text */
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
-
-/** The maximum file size the parser should load */
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
-
-/** Contents between curly braces "{" and "}" are interpreted as text */
define('HDOM_SMARTY_AS_TEXT', 1);
-// helper functions
-// -----------------------------------------------------------------------------
-// get html dom from file
-// $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
function file_get_html(
$url,
$use_include_path = false,
@@ -111,10 +58,8 @@ function file_get_html(
$defaultBRText = DEFAULT_BR_TEXT,
$defaultSpanText = DEFAULT_SPAN_TEXT)
{
- // Ensure maximum length is greater than zero
if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }
- // We DO force the tags to be terminated.
$dom = new simple_html_dom(
null,
$lowercase,
@@ -122,7 +67,8 @@ function file_get_html(
$target_charset,
$stripRN,
$defaultBRText,
- $defaultSpanText);
+ $defaultSpanText
+ );
/**
* For sourceforge users: uncomment the next line and comment the
@@ -133,19 +79,18 @@ function file_get_html(
$use_include_path,
$context,
$offset,
- $maxLen);
-
- // Paperg - use our own mechanism for getting the contents as we want to
- // control the timeout.
+ $maxLen
+ );
// $contents = retrieve_url_contents($url);
- if (empty($contents) || strlen($contents) > $maxLen) { return false; }
- // The second parameter can force the selectors to all be lowercase.
- $dom->load($contents, $lowercase, $stripRN);
- return $dom;
+ if (empty($contents) || strlen($contents) > $maxLen) {
+ $dom->clear();
+ return false;
+ }
+
+ return $dom->load($contents, $lowercase, $stripRN);
}
-// get html dom from string
function str_get_html(
$str,
$lowercase = true,
@@ -162,97 +107,34 @@ function str_get_html(
$target_charset,
$stripRN,
$defaultBRText,
- $defaultSpanText);
+ $defaultSpanText
+ );
if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
$dom->clear();
return false;
}
- $dom->load($str, $lowercase, $stripRN);
- return $dom;
+ return $dom->load($str, $lowercase, $stripRN);
}
-// dump html dom tree
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
$node->dump($node);
}
-/**
- * simple html dom node
- * PaperG - added ability for "find" routine to lowercase the value of the
- * selector.
- *
- * PaperG - added $tag_start to track the start position of the tag in the total
- * byte index
- *
- * @package PlaceLocalInclude
- */
class simple_html_dom_node
{
- /**
- * Node type
- *
- * Default is {@see HDOM_TYPE_TEXT}
- *
- * @var int
- */
public $nodetype = HDOM_TYPE_TEXT;
-
- /**
- * Tag name
- *
- * Default is 'text'
- *
- * @var string
- */
public $tag = 'text';
-
- /**
- * List of attributes
- *
- * @var array
- */
public $attr = array();
-
- /**
- * List of child node objects
- *
- * @var array
- */
public $children = array();
public $nodes = array();
-
- /**
- * The parent node object
- *
- * @var object|null
- */
public $parent = null;
-
- // The "info" array - see HDOM_INFO_... for what each element contains.
public $_ = array();
-
- /**
- * Start position of the tag in the document
- *
- * @var int
- */
public $tag_start = 0;
-
- /**
- * The DOM object
- *
- * @var object|null
- */
private $dom = null;
- /**
- * Construct new node object
- *
- * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes}
- */
function __construct($dom)
{
$this->dom = $dom;
@@ -269,7 +151,6 @@ class simple_html_dom_node
return $this->outertext();
}
- // clean up memory due to php5 circular references memory leak...
function clear()
{
$this->dom = null;
@@ -278,17 +159,14 @@ class simple_html_dom_node
$this->children = null;
}
- // dump node's tree
- function dump($show_attr = true, $deep = 0)
+ function dump($show_attr = true, $depth = 0)
{
- $lead = str_repeat(' ', $deep);
-
- echo $lead . $this->tag;
+ echo str_repeat("\t", $depth) . $this->tag;
if ($show_attr && count($this->attr) > 0) {
echo '(';
foreach ($this->attr as $k => $v) {
- echo "[$k]=>\"" . $this->$k . '", ';
+ echo "[$k]=>\"$v\", ";
}
echo ')';
}
@@ -296,14 +174,12 @@ class simple_html_dom_node
echo "\n";
if ($this->nodes) {
- foreach ($this->nodes as $c) {
- $c->dump($show_attr, $deep + 1);
+ foreach ($this->nodes as $node) {
+ $node->dump($show_attr, $depth + 1);
}
}
}
-
- // Debugging function to dump a single dom node with a bunch of information about it.
function dump_node($echo = true)
{
$string = $this->tag;
@@ -311,7 +187,7 @@ class simple_html_dom_node
if (count($this->attr) > 0) {
$string .= '(';
foreach ($this->attr as $k => $v) {
- $string .= "[$k]=>\"" . $this->$k . '", ';
+ $string .= "[$k]=>\"$v\", ";
}
$string .= ')';
}
@@ -322,24 +198,24 @@ class simple_html_dom_node
if (is_array($v)) {
$string .= "[$k]=>(";
foreach ($v as $k2 => $v2) {
- $string .= "[$k2]=>\"" . $v2 . '", ';
+ $string .= "[$k2]=>\"$v2\", ";
}
$string .= ')';
} else {
- $string .= "[$k]=>\"" . $v . '", ';
+ $string .= "[$k]=>\"$v\", ";
}
}
$string .= ')';
}
if (isset($this->text)) {
- $string .= ' text: (' . $this->text . ')';
+ $string .= " text: ({$this->text})";
}
- $string .= " HDOM_INNER_INFO: '";
+ $string .= ' HDOM_INNER_INFO: ';
if (isset($node->_[HDOM_INFO_INNER])) {
- $string .= $node->_[HDOM_INFO_INNER] . "'";
+ $string .= "'" . $node->_[HDOM_INFO_INNER] . "'";
} else {
$string .= ' NULL ';
}
@@ -357,13 +233,6 @@ class simple_html_dom_node
}
}
- /**
- * Return or set parent node
- *
- * @param object|null $parent (optional) The parent node, `null` to return
- * the current parent node.
- * @return object|null The parent node
- */
function parent($parent = null)
{
// I am SURE that this doesn't work properly.
@@ -378,22 +247,11 @@ class simple_html_dom_node
return $this->parent;
}
- /**
- * @return bool True if the node has at least one child node
- */
function has_child()
{
return !empty($this->children);
}
- /**
- * Get child node at specified index
- *
- * @param int $idx The index of the child node to return, `-1` to return all
- * child nodes.
- * @return object|array|null The child node at the specified index, all child
- * nodes or null if the index is invalid.
- */
function children($idx = -1)
{
if ($idx === -1) {
@@ -407,15 +265,6 @@ class simple_html_dom_node
return null;
}
- /**
- * Get first child node
- *
- * @return object|null The first child node or null if the current node has
- * no child nodes.
- *
- * @todo Use `empty()` instead of `count()` to improve performance on large
- * arrays.
- */
function first_child()
{
if (count($this->children) > 0) {
@@ -424,108 +273,70 @@ class simple_html_dom_node
return null;
}
- /**
- * Get last child node
- *
- * @return object|null The last child node or null if the current node has
- * no child nodes.
- *
- * @todo Use `end()` to slightly improve performance on large arrays.
- */
function last_child()
{
- if (($count = count($this->children)) > 0) {
- return $this->children[$count - 1];
+ if (count($this->children) > 0) {
+ return end($this->children);
}
return null;
}
- /**
- * Get next sibling node
- *
- * @return object|null The sibling node or null if the current node has no
- * sibling nodes.
- */
function next_sibling()
{
if ($this->parent === null) {
return null;
}
- $idx = 0;
- $count = count($this->parent->children);
+ $idx = array_search($this, $this->parent->children, true);
- while ($idx < $count && $this !== $this->parent->children[$idx]) {
- ++$idx;
+ if ($idx !== false && isset($this->parent->children[$idx + 1])) {
+ return $this->parent->children[$idx + 1];
}
- if (++$idx >= $count) {
+ return null;
+ }
+
+ function prev_sibling()
+ {
+ if ($this->parent === null) {
return null;
}
- return $this->parent->children[$idx];
- }
+ $idx = array_search($this, $this->parent->children, true);
- /**
- * Get previous sibling node
- *
- * @return object|null The sibling node or null if the current node has no
- * sibling nodes.
- */
- function prev_sibling()
- {
- if ($this->parent === null) { return null; }
-
- $idx = 0;
- $count = count($this->parent->children);
-
- while ($idx < $count && $this !== $this->parent->children[$idx]) {
- ++$idx;
+ if ($idx !== false && $idx > 0) {
+ return $this->parent->children[$idx - 1];
}
- if (--$idx < 0) { return null; }
-
- return $this->parent->children[$idx];
+ return null;
}
- /**
- * Traverse ancestors to the first matching tag.
- *
- * @param string $tag Tag to find
- * @return object|null First matching node in the DOM tree or null if no
- * match was found.
- *
- * @todo Null is returned implicitly by calling ->parent on the root node.
- * This behaviour could change at any time, rendering this function invalid.
- */
function find_ancestor_tag($tag)
{
global $debug_object;
if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
- // Start by including ourselves in the comparison.
- $returnDom = $this;
+ if ($this->parent === null) {
+ return null;
+ }
- while (!is_null($returnDom)) {
+ $ancestor = $this->parent;
+
+ while (!is_null($ancestor)) {
if (is_object($debug_object)) {
- $debug_object->debug_log(2, 'Current tag is: ' . $returnDom->tag);
+ $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
}
- if ($returnDom->tag == $tag) {
+ if ($ancestor->tag === $tag) {
break;
}
- $returnDom = $returnDom->parent;
+ $ancestor = $ancestor->parent;
}
- return $returnDom;
+ return $ancestor;
}
- /**
- * Get node's inner text (everything inside the opening and closing tags)
- *
- * @return string
- */
function innertext()
{
if (isset($this->_[HDOM_INFO_INNER])) {
@@ -545,11 +356,6 @@ class simple_html_dom_node
return $ret;
}
- /**
- * Get node's outer text (everything including the opening and closing tags)
- *
- * @return string
- */
function outertext()
{
global $debug_object;
@@ -566,9 +372,11 @@ class simple_html_dom_node
$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
}
- if ($this->tag === 'root') return $this->innertext();
+ if ($this->tag === 'root') {
+ return $this->innertext();
+ }
- // trigger callback
+ // todo: What is the use of this callback? Remove?
if ($this->dom && $this->dom->callback !== null) {
call_user_func_array($this->dom->callback, array($this));
}
@@ -581,29 +389,23 @@ class simple_html_dom_node
return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
}
- // render begin tag
+ $ret = '';
+
if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
- } else {
- $ret = '';
}
- // render inner text
if (isset($this->_[HDOM_INFO_INNER])) {
- // If it's a br tag... don't return the HDOM_INNER_INFO that we
- // may or may not have added.
+ // todo: <br> should either never have HDOM_INFO_INNER or always
if ($this->tag !== 'br') {
$ret .= $this->_[HDOM_INFO_INNER];
}
- } else {
- if ($this->nodes) {
- foreach ($this->nodes as $n) {
- $ret .= $this->convert_text($n->outertext());
- }
+ } elseif ($this->nodes) {
+ foreach ($this->nodes as $n) {
+ $ret .= $this->convert_text($n->outertext());
}
}
- // render end tag
if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
$ret .= '</' . $this->tag . '>';
}
@@ -611,11 +413,6 @@ class simple_html_dom_node
return $ret;
}
- /**
- * Get node's plain text (everything excluding all tags)
- *
- * @return string
- */
function text()
{
if (isset($this->_[HDOM_INFO_INNER])) {
@@ -642,7 +439,7 @@ class simple_html_dom_node
foreach ($this->nodes as $n) {
// Start paragraph after a blank line
if ($n->tag === 'p') {
- $ret .= "\n\n";
+ $ret = trim($ret) . "\n\n";
}
$ret .= $this->convert_text($n->text());
@@ -655,14 +452,9 @@ class simple_html_dom_node
}
}
}
- return trim($ret);
+ return $ret;
}
- /**
- * Get node's xml text (inner text as a CDATA section)
- *
- * @return string
- */
function xmltext()
{
$ret = $this->innertext();
@@ -671,7 +463,6 @@ class simple_html_dom_node
return $ret;
}
- // build node's text with tag
function makeup()
{
// text, comment, unknown
@@ -715,18 +506,6 @@ class simple_html_dom_node
return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
- /**
- * Find elements by CSS selector
- *
- * @param string $selector The CSS selector
- * @param int|null $idx Index of element to return form the list of matching
- * elements (default: `null` = disabled).
- * @param bool $lowercase Matches tag names case insensitive (lowercase) if
- * enabled (default: `false`)
- * @return array|object|null A list of elements matching the specified CSS
- * selector or a single element if $idx is specified or null if no element
- * was found.
- */
function find($selector, $idx = null, $lowercase = false)
{
$selectors = $this->parse_selector($selector);
@@ -779,19 +558,6 @@ class simple_html_dom_node
return (isset($found[$idx])) ? $found[$idx] : null;
}
- /**
- * Seek DOM elements by selector
- *
- * **Note**
- * The selector element must be compatible to a selector from
- * {@see simple_html_dom_node::parse_selector()}
- *
- * @param array $selector A selector element
- * @param array $ret An array of matches
- * @param bool $lowercase Matches tag names case insensitive (lowercase) if
- * enabled (default: `false`)
- * @return void
- */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
global $debug_object;
@@ -823,7 +589,8 @@ class simple_html_dom_node
&& $this->parent
&& in_array($this, $this->parent->children)) { // Next-Sibling Combinator
$index = array_search($this, $this->parent->children, true) + 1;
- $nodes[] = $this->parent->children[$index];
+ if ($index < count($this->parent->children))
+ $nodes[] = $this->parent->children[$index];
} elseif ($parent_cmd === '~'
&& $this->parent
&& in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator
@@ -1006,24 +773,6 @@ class simple_html_dom_node
}
}
- /**
- * Match value and pattern for a given CSS expression
- *
- * **Supported Expressions**
- *
- * | Expression | Description
- * | ---------- | -----------
- * | `=` | $value and $pattern must be equal
- * | `!=` | $value and $pattern must not be equal
- * | `^=` | $value must start with $pattern
- * | `$=` | $value must end with $pattern
- * | `*=` | $value must contain $pattern
- *
- * @param string $exp The expression.
- * @param string $pattern The pattern
- * @param string $value The value
- * @value bool True if $value matches $pattern
- */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
global $debug_object;
@@ -1069,31 +818,6 @@ class simple_html_dom_node
return false;
}
- /**
- * Parse CSS selector
- *
- * @param string $selector_string CSS selector string
- * @return array List of CSS selectors. The format depends on the type of
- * selector:
- *
- * ```php
- *
- * array( // list of selectors (each separated by a comma), i.e. 'img, p, div'
- * array( // list of combinator selectors, i.e. 'img > p > div'
- * array( // selector element
- * [0], // (string) The element tag
- * [1], // (string) The element id
- * [2], // (array) The element classes
- * [3], // (array>) The list of attributes, each
- * // with four elements: name, expression, value, inverted
- * [4] // (string) The selector combinator (' ' | '>' | '+' | '~')
- * )
- * )
- * )
- * ```
- *
- * @link https://www.w3.org/TR/selectors/#compound Compound selector
- */
protected function parse_selector($selector_string)
{
global $debug_object;
@@ -1185,7 +909,7 @@ class simple_html_dom_node
*/
if($m[4] !== '') {
preg_match_all(
- "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is",
+ "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
trim($m[4]),
$attributes,
PREG_SET_ORDER
@@ -1285,8 +1009,6 @@ class simple_html_dom_node
if (isset($this->attr[$name])) { unset($this->attr[$name]); }
}
- // PaperG - Function to convert the text from one character set to another
- // if the two sets are not the same.
function convert_text($text)
{
global $debug_object;
@@ -1337,12 +1059,6 @@ class simple_html_dom_node
return $converted_text;
}
- /**
- * Returns true if $string is valid UTF-8 and false otherwise.
- *
- * @param mixed $str String to be tested
- * @return boolean
- */
static function is_utf8($str)
{
$c = 0; $b = 0;
@@ -1370,16 +1086,6 @@ class simple_html_dom_node
return true;
}
- /**
- * Function to try a few tricks to determine the displayed size of an img on
- * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all
- * other tag types.
- *
- * @author John Schlick
- * @version April 19 2012
- * @return array an array containing the 'height' and 'width' of the image
- * on the page or -1 if we can't figure it out.
- */
function get_display_size()
{
global $debug_object;
@@ -1465,7 +1171,82 @@ class simple_html_dom_node
return $result;
}
- // camel naming conventions
+	/**
+	 * Render this node, including its own tags, and optionally write the
+	 * markup to a file.
+	 *
+	 * @param string $filepath Destination path; an empty string skips writing.
+	 * @return string The node's outer text.
+	 */
+	function save($filepath = '')
+	{
+		$markup = $this->outertext();
+
+		if ($filepath === '') {
+			return $markup;
+		}
+
+		// LOCK_EX requests an exclusive lock while the file is written.
+		file_put_contents($filepath, $markup, LOCK_EX);
+
+		return $markup;
+	}
+
+	/**
+	 * Add one or more class names to this node's "class" attribute.
+	 *
+	 * Names that hasClass() already reports as present are not added again.
+	 *
+	 * @param string|array $class Space separated class names, or a list of
+	 * names. Any other type is reported to the debug object (if one is set)
+	 * and otherwise ignored.
+	 * @return void
+	 */
+	function addClass($class)
+	{
+		// Without this declaration $debug_object is an undefined local and
+		// the diagnostic below can never fire.
+		global $debug_object;
+
+		if (is_string($class)) {
+			$class = explode(' ', $class);
+		}
+
+		if (!is_array($class)) {
+			if (is_object($debug_object)) {
+				// The message is concatenated, like every other debug_log() call.
+				$debug_object->debug_log(2, 'Invalid type: ' . gettype($class));
+			}
+			return;
+		}
+
+		foreach ($class as $c) {
+			// Consecutive spaces make explode() yield empty names; skip them
+			// so no stray separators end up in the attribute.
+			if ($c === '') {
+				continue;
+			}
+
+			if (!isset($this->class)) {
+				$this->class = $c;
+			} elseif (!$this->hasClass($c)) {
+				$this->class .= ' ' . $c;
+			}
+		}
+	}
+
+	/**
+	 * Check whether this node's "class" attribute contains a class name.
+	 *
+	 * @param string $class A single class name; the comparison is strict and
+	 * therefore case sensitive.
+	 * @return bool True if the name is present. False otherwise, including
+	 * when $class is not a string or the node has no class attribute.
+	 */
+	function hasClass($class)
+	{
+		// Without this declaration $debug_object is an undefined local and
+		// the diagnostic below can never fire.
+		global $debug_object;
+
+		if (!is_string($class)) {
+			if (is_object($debug_object)) {
+				// The message is concatenated, like every other debug_log() call.
+				$debug_object->debug_log(2, 'Invalid type: ' . gettype($class));
+			}
+			return false;
+		}
+
+		if (!isset($this->class)) {
+			return false;
+		}
+
+		return in_array($class, explode(' ', $this->class), true);
+	}
+
+	/**
+	 * Remove class names from this node's "class" attribute.
+	 *
+	 * @param string|array|null $class Space separated class names or a list
+	 * of names to remove; null removes the whole attribute. Any other type
+	 * leaves the node unchanged.
+	 * @return void
+	 */
+	function removeClass($class = null)
+	{
+		if (!isset($this->class)) {
+			return;
+		}
+
+		if ($class === null) {
+			$this->removeAttribute('class');
+			return;
+		}
+
+		$unwanted = is_string($class) ? explode(' ', $class) : $class;
+
+		if (!is_array($unwanted)) {
+			return;
+		}
+
+		$remaining = array_diff(explode(' ', $this->class), $unwanted);
+
+		// Drop the attribute entirely rather than leaving it empty.
+		if (empty($remaining)) {
+			$this->removeAttribute('class');
+			return;
+		}
+
+		$this->class = implode(' ', $remaining);
+	}
+
function getAllAttributes()
{
return $this->attr;
@@ -1491,6 +1272,44 @@ class simple_html_dom_node
$this->__set($name, null);
}
+	/**
+	 * Remove this node from its parent via the parent's removeChild().
+	 * Does nothing when the node has no parent.
+	 *
+	 * @return void
+	 */
+	function remove()
+	{
+		if (!$this->parent) {
+			return;
+		}
+
+		$this->parent->removeChild($this);
+	}
+
+	/**
+	 * Remove a direct child, and everything below it, from this node and
+	 * from the DOM's node list.
+	 *
+	 * Nothing happens unless $node is found in this node's $nodes and
+	 * $children lists as well as in the DOM's node list.
+	 *
+	 * @param simple_html_dom_node $node The child to remove. It is cleared
+	 * afterwards and must not be used again.
+	 * @return void
+	 */
+	function removeChild($node)
+	{
+		// A cleared node has no DOM left to look the child up in.
+		if ($this->dom === null) {
+			return;
+		}
+
+		$nidx = array_search($node, $this->nodes, true);
+		$cidx = array_search($node, $this->children, true);
+		$didx = array_search($node, $this->dom->nodes, true);
+
+		if ($nidx === false || $cidx === false || $didx === false) {
+			return;
+		}
+
+		// Depth first: each child removes its own subtree before $node goes.
+		foreach ($node->children as $child) {
+			$node->removeChild($child);
+		}
+
+		// Whatever is still listed in $node->nodes was not removed through
+		// $node->children above; drop those entries from the DOM as well.
+		foreach ($node->nodes as $enidx => $entity) {
+			$edidx = array_search($entity, $node->dom->nodes, true);
+
+			if ($edidx !== false) {
+				unset($node->nodes[$enidx]);
+				unset($node->dom->nodes[$edidx]);
+			}
+		}
+
+		unset($this->nodes[$nidx]);
+		unset($this->children[$cidx]);
+		unset($this->dom->nodes[$didx]);
+
+		// unset() leaves a gap in the keys. next_sibling(), prev_sibling()
+		// and the '+' combinator in seek() reach neighbours by index +/- 1,
+		// so the child list has to stay consecutive. $this->dom->nodes is
+		// deliberately NOT reindexed: nodes address it by the position they
+		// stored in HDOM_INFO_BEGIN.
+		$this->children = array_values($this->children);
+
+		$node->clear();
+	}
+
function getElementById($id)
{
return $this->find("#$id", 0);
@@ -1559,170 +1378,34 @@ class simple_html_dom_node
}
-/**
- * simple html dom parser
- *
- * Paperg - in the find routine: allow us to specify that we want case
- * insensitive testing of the value of the selector.
- *
- * Paperg - change $size from protected to public so we can easily access it
- *
- * Paperg - added ForceTagsClosed in the constructor which tells us whether we
- * trust the html or not. Default is to NOT trust it.
- *
- * @package PlaceLocalInclude
- */
class simple_html_dom
{
- /**
- * The root node of the document
- *
- * @var object
- */
public $root = null;
-
- /**
- * List of nodes in the current DOM
- *
- * @var array
- */
public $nodes = array();
-
- /**
- * Callback function to run for each element in the DOM.
- *
- * @var callable|null
- */
public $callback = null;
-
- /**
- * Indicates how tags and attributes are matched
- *
- * @var bool When set to **true** tags and attributes will be converted to
- * lowercase before matching.
- */
public $lowercase = false;
-
- /**
- * Original document size
- *
- * Holds the original document size.
- *
- * @var int
- */
public $original_size;
-
- /**
- * Current document size
- *
- * Holds the current document size. The document size is determined by the
- * string length of ({@see simple_html_dom::$doc}).
- *
- * _Note_: Using this variable is more efficient than calling `strlen($doc)`
- *
- * @var int
- * */
public $size;
- /**
- * Current position in the document
- *
- * @var int
- */
protected $pos;
-
- /**
- * The document
- *
- * @var string
- */
protected $doc;
-
- /**
- * Current character
- *
- * Holds the current character at position {@see simple_html_dom::$pos} in
- * the document {@see simple_html_dom::$doc}
- *
- * _Note_: Using this variable is more efficient than calling
- * `substr($doc, $pos, 1)`
- *
- * @var string
- */
protected $char;
protected $cursor;
-
- /**
- * Parent node of the next node detected by the parser
- *
- * @var object
- */
protected $parent;
protected $noise = array();
-
- /**
- * Tokens considered blank in HTML
- *
- * @var string
- */
protected $token_blank = " \t\r\n";
-
- /**
- * Tokens to identify the equal sign for attributes, stopping either at the
- * closing tag ("/" i.e. "") or the end of an opening tag (">" i.e.
- * "")
- *
- * @var string
- */
protected $token_equal = ' =/>';
-
- /**
- * Tokens to identify the end of a tag name. A tag name either ends on the
- * ending slash ("/" i.e. "") or whitespace ("\s\r\n\t")
- *
- * @var string
- */
protected $token_slash = " />\r\n\t";
-
- /**
- * Tokens to identify the end of an attribute
- *
- * @var string
- */
protected $token_attr = ' >';
- // Note that this is referenced by a child node, and so it needs to be
- // public for that node to see this information.
public $_charset = '';
public $_target_charset = '';
- /**
- * Innertext for <br> elements
- *
- * @var string
- */
protected $default_br_text = '';
- /**
- * Suffix for <span> elements
- *
- * @var string
- */
public $default_span_text = '';
- /**
- * Defines a list of self-closing tags (Void elements) according to the HTML
- * Specification
- *
- * _Remarks_:
- * - Use `isset()` instead of `in_array()` on array elements to boost
- * performance about 30%
- * - Sort elements by name for better readability!
- *
- * @link https://www.w3.org/TR/html HTML Specification
- * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
- */
protected $self_closing_tags = array(
'area' => 1,
'base' => 1,
@@ -1739,18 +1422,6 @@ class simple_html_dom
'track' => 1,
'wbr' => 1
);
-
- /**
- * Defines a list of tags which - if closed - close all optional closing
- * elements within if they haven't been closed yet. (So, an element where
- * neither opening nor closing tag is omissible consistently closes every
- * optional closing element within)
- *
- * _Remarks_:
- * - Use `isset()` instead of `in_array()` on array elements to boost
- * performance about 30%
- * - Sort elements by name for better readability!
- */
protected $block_tags = array(
'body' => 1,
'div' => 1,
@@ -1759,62 +1430,6 @@ class simple_html_dom
'span' => 1,
'table' => 1
);
-
- /**
- * Defines elements whose end tag is omissible.
- *
- * * key = Name of an element whose end tag is omissible.
- * * value = Names of elements whose end tag is omissible, that are closed
- * by the current element.
- *
- * _Remarks_:
- * - Use `isset()` instead of `in_array()` on array elements to boost
- * performance about 30%
- * - Sort elements by name for better readability!
- *
- * **Example**
- *
- * An `li` element’s end tag may be omitted if the `li` element is immediately
- * followed by another `li` element. To do that, add following element to the
- * array:
- *
- * ```php
- * 'li' => array('li'),
- * ```
- *
- * With this, the following two examples are considered equal. Note that the
- * second example is missing the closing tags on `li` elements.
- *
- * ```html
- *
- * ```
- *
- *
- *
- * ```html
- *
- * ```
- *
- *
- *
- * @var array A two-dimensional array where the key is the name of an
- * element whose end tag is omissible and the value is an array of elements
- * whose end tag is omissible, that are closed by the current element.
- *
- * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
- *
- * @todo The implementation of optional closing tags doesn't work in all cases
- * because it only consideres elements who close other optional closing
- * tags, not taking into account that some (non-blocking) tags should close
- * these optional closing tags. For example, the end tag for "p" is omissible
- * and can be closed by an "address" element, whose end tag is NOT omissible.
- * Currently a "p" element without closing tag stops at the next "p" element
- * or blocking tag, even if it contains other elements.
- *
- * @todo Known sourceforge issue #2977341
- * B tags that are not closed cause us to return everything to the end of
- * the document.
- */
protected $optional_closing_tags = array(
// Not optional, see
// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
@@ -1873,7 +1488,6 @@ class simple_html_dom
$this->clear();
}
- // load html from string
function load(
$str,
$lowercase = true,
@@ -1928,7 +1542,6 @@ class simple_html_dom
return $this;
}
- // load html from file
function load_file()
{
$args = func_get_args();
@@ -1940,29 +1553,16 @@ class simple_html_dom
}
}
- /**
- * Set the callback function
- *
- * @param callable $function_name Callback function to run for each element
- * in the DOM.
- * @return void
- */
function set_callback($function_name)
{
$this->callback = $function_name;
}
- /**
- * Remove callback function
- *
- * @return void
- */
function remove_callback()
{
$this->callback = null;
}
- // save dom as string
function save($filepath = '')
{
$ret = $this->root->innertext();
@@ -1970,18 +1570,18 @@ class simple_html_dom
return $ret;
}
- // find dom node by css selector
- // Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
function find($selector, $idx = null, $lowercase = false)
{
return $this->root->find($selector, $idx, $lowercase);
}
- // clean up memory due to php5 circular references memory leak...
function clear()
{
- foreach ($this->nodes as $n) {
- $n->clear(); $n = null;
+ if (isset($this->nodes)) {
+ foreach ($this->nodes as $n) {
+ $n->clear();
+ $n = null;
+ }
}
// This add next line is documented in the sourceforge repository.
@@ -1989,7 +1589,8 @@ class simple_html_dom
// use of clear.
if (isset($this->children)) {
foreach ($this->children as $n) {
- $n->clear(); $n = null;
+ $n->clear();
+ $n = null;
}
}
@@ -2012,7 +1613,6 @@ class simple_html_dom
$this->root->dump($show_attr);
}
- // prepare HTML data and init everything
protected function prepare(
$str, $lowercase = true,
$defaultBRText = DEFAULT_BR_TEXT,
@@ -2038,11 +1638,6 @@ class simple_html_dom
if ($this->size > 0) { $this->char = $this->doc[0]; }
}
- /**
- * Parse HTML content
- *
- * @return bool True on success
- */
protected function parse()
{
while (true) {
@@ -2064,13 +1659,6 @@ class simple_html_dom
}
}
- // PAPERG - dkchou - added this to try to identify the character set of the
- // page we have just parsed so we know better how to spit it out later.
- // NOTE: IF you provide a routine called
- // get_last_retrieve_url_contents_content_type which returns the
- // CURLINFO_CONTENT_TYPE from the last curl_exec
- // (or the content_type header from the last transfer), we will parse THAT,
- // and if a charset is specified, we will use it over any other mechanism.
protected function parse_charset()
{
global $debug_object;
@@ -2092,6 +1680,7 @@ class simple_html_dom
}
if (empty($charset)) {
+ // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
if (!empty($el)) {
@@ -2128,53 +1717,77 @@ class simple_html_dom
}
}
- // If we couldn't find a charset above, then lets try to detect one
- // based on the text we got...
if (empty($charset)) {
- // Use this in case mb_detect_charset isn't installed/loaded on
- // this machine.
- $charset = false;
- if (function_exists('mb_detect_encoding')) {
- // Have php try to detect the encoding from the text given to us.
- $charset = mb_detect_encoding(
- $this->doc . 'ascii',
- $encoding_list = array( 'UTF-8', 'CP1252' )
- );
-
+ // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
+ if ($meta = $this->root->find('meta[charset]', 0)) {
+ $charset = $meta->charset;
if (is_object($debug_object)) {
- $debug_object->debug_log(2, 'mb_detect found: ' . $charset);
+ $debug_object->debug_log(2, 'meta charset: ' . $charset);
}
}
+ }
- // and if this doesn't work... then we need to just wrongheadedly
- // assume it's UTF-8 so that we can move on - cause this will
- // usually give us most of what we need...
- if ($charset === false) {
- if (is_object($debug_object)) {
- $debug_object->debug_log(
- 2,
- 'since mb_detect failed - using default of utf-8'
- );
+ if (empty($charset)) {
+ // Try to guess the charset based on the content
+ // Requires Multibyte String (mbstring) support (optional)
+ if (function_exists('mb_detect_encoding')) {
+ /**
+ * mb_detect_encoding() is not intended to distinguish between
+ * charsets, especially single-byte charsets. Its primary
+ * purpose is to detect which multibyte encoding is in use,
+ * i.e. UTF-8, UTF-16, shift-JIS, etc.
+ *
+ * -- https://bugs.php.net/bug.php?id=38138
+ *
+ * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
+ * always result in CP1251/ISO-8859-5 and vice versa.
+ *
+ * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
+ * to stay compatible.
+ */
+ $encoding = mb_detect_encoding(
+ $this->doc,
+ array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
+ );
+
+ if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
+ // mb_detect_encoding reports 'CP1251'/'ISO-8859-5' content as
+ // 'CP1252'/'ISO-8859-1'. Converting such content with iconv
+ // fails, in which case assume it is the other charset. iconv is
+ // an optional extension, so only probe when it is available.
+ if (function_exists('iconv') && @iconv('CP1252', 'UTF-8', $this->doc) === false) {
+ $encoding = 'CP1251';
+ }
+ }
- $charset = 'UTF-8';
+ if ($encoding !== false) {
+ $charset = $encoding;
+ if (is_object($debug_object)) {
+ $debug_object->debug_log(2, 'mb_detect: ' . $charset);
+ }
+ }
+ }
+ }
+
+ if (empty($charset)) {
+ // Assume it's UTF-8 as it is the most likely charset to be used
+ $charset = 'UTF-8';
+ if (is_object($debug_object)) {
+ $debug_object->debug_log(2, 'No match found, assume ' . $charset);
}
}
// Since CP1252 is a superset, if we get one of it's subsets, we want
// it instead.
- if ((strtolower($charset) == strtolower('ISO-8859-1'))
- || (strtolower($charset) == strtolower('Latin1'))
- || (strtolower($charset) == strtolower('Latin-1'))) {
-
+ if ((strtolower($charset) == 'iso-8859-1')
+ || (strtolower($charset) == 'latin1')
+ || (strtolower($charset) == 'latin-1')) {
if (is_object($debug_object)) {
- $debug_object->debug_log(
- 2,
+ $debug_object->debug_log(2,
'replacing ' . $charset . ' with CP1252 as its a superset'
);
}
-
- $charset = 'CP1252';
+ $charset = 'CP1252'; // assigned after logging so the message names the charset being replaced
}
if (is_object($debug_object)) {
@@ -2184,11 +1797,6 @@ class simple_html_dom
return $this->_charset = $charset;
}
- /**
- * Parse tag from current document position.
- *
- * @return bool True if a tag was found, false otherwise
- */
protected function read_tag()
{
// Set end position if no further tags found
@@ -2467,63 +2075,50 @@ class simple_html_dom
return true;
}
- /**
- * Parse attribute from current document position
- *
- * @param object $node Node for the attributes
- * @param string $name Name of the current attribute
- * @param array $space Array for spacing information
- * @return void
- */
protected function parse_attr($node, $name, &$space)
{
- // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
- // If the attribute is already defined inside a tag, only pay attention
- // to the first one as opposed to the last one.
- // https://stackoverflow.com/a/26341866
- if (isset($node->attr[$name])) {
- return;
- }
+ $is_duplicate = isset($node->attr[$name]);
- // [2] Whitespace between "=" and the value
- $space[2] = $this->copy_skip($this->token_blank);
+ $blank = $this->copy_skip($this->token_blank); // always consume whitespace between "=" and value
+ if (!$is_duplicate) $space[2] = $blank; // record it only for the attribute that is kept
switch ($this->char) {
- case '"': // value is anything between double quotes
- $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
+ case '"':
+ $quote_type = HDOM_QUOTE_DOUBLE;
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
- $node->attr[$name] = $this->restore_noise($this->copy_until_char('"'));
+ $value = $this->copy_until_char('"');
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
break;
- case '\'': // value is anything between single quotes
- $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
+ case '\'':
+ $quote_type = HDOM_QUOTE_SINGLE;
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
- $node->attr[$name] = $this->restore_noise($this->copy_until_char('\''));
+ $value = $this->copy_until_char('\'');
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
break;
- default: // value is anything until the first space or end tag
- $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
- $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
+ default:
+ $quote_type = HDOM_QUOTE_NO;
+ $value = $this->copy_until($this->token_attr);
}
+
+ $value = $this->restore_noise($value);
+
// PaperG: Attributes should not have \r or \n in them, that counts as
// html whitespace.
- $node->attr[$name] = str_replace("\r", '', $node->attr[$name]);
- $node->attr[$name] = str_replace("\n", '', $node->attr[$name]);
+ $value = str_replace("\r", '', $value);
+ $value = str_replace("\n", '', $value);
+
// PaperG: If this is a "class" selector, lets get rid of the preceeding
// and trailing space since some people leave it in the multi class case.
if ($name === 'class') {
- $node->attr[$name] = trim($node->attr[$name]);
+ $value = trim($value);
+ }
+
+ if (!$is_duplicate) {
+ $node->_[HDOM_INFO_QUOTE][] = $quote_type;
+ $node->attr[$name] = $value;
}
}
- /**
- * Link node to parent node
- *
- * @param object $node Node to link to parent
- * @param bool $is_child True if the node is a child of parent
- * @return void
- */
- // link node's parent
protected function link_nodes(&$node, $is_child)
{
$node->parent = $this->parent;
@@ -2533,12 +2128,6 @@ class simple_html_dom
}
}
- /**
- * Add tag as text node to current node
- *
- * @param string $tag Tag name
- * @return bool True on success
- */
protected function as_text_node($tag)
{
$node = new simple_html_dom_node($this);
@@ -2549,28 +2138,12 @@ class simple_html_dom
return true;
}
- /**
- * Seek from the current document position to the first occurrence of a
- * character not defined by the provided string. Update the current document
- * position to the new position.
- *
- * @param string $chars A string containing every allowed character.
- * @return void
- */
protected function skip($chars)
{
$this->pos += strspn($this->doc, $chars, $this->pos);
$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
}
- /**
- * Copy substring from the current document position to the first occurrence
- * of a character not defined by the provided string.
- *
- * @param string $chars A string containing every allowed character.
- * @return string Substring from the current document position to the first
- * occurrence of a character not defined by the provided string.
- */
protected function copy_skip($chars)
{
$pos = $this->pos;
@@ -2581,14 +2154,6 @@ class simple_html_dom
return substr($this->doc, $pos, $len);
}
- /**
- * Copy substring from the current document position to the first occurrence
- * of any of the provided characters.
- *
- * @param string $chars A string containing every character to stop at.
- * @return string Substring from the current document position to the first
- * occurrence of any of the provided characters.
- */
protected function copy_until($chars)
{
$pos = $this->pos;
@@ -2598,14 +2163,6 @@ class simple_html_dom
return substr($this->doc, $pos, $len);
}
- /**
- * Copy substring from the current document position to the first occurrence
- * of the provided string.
- *
- * @param string $char The string to stop at.
- * @return string Substring from the current document position to the first
- * occurrence of the provided string.
- */
protected function copy_until_char($char)
{
if ($this->char === null) { return ''; }
@@ -2625,15 +2182,6 @@ class simple_html_dom
return substr($this->doc, $pos_old, $pos - $pos_old);
}
- /**
- * Remove noise from HTML content
- *
- * Noise is stored to {@see simple_html_dom::$noise}
- *
- * @param string $pattern The regex pattern used for finding noise
- * @param bool $remove_tag True to remove the entire match. Default is false
- * to only remove the captured data.
- */
protected function remove_noise($pattern, $remove_tag = false)
{
global $debug_object;
@@ -2666,14 +2214,6 @@ class simple_html_dom
}
}
- /**
- * Restore noise to HTML content
- *
- * Noise is restored from {@see simple_html_dom::$noise}
- *
- * @param string $text A subset of HTML containing noise
- * @return string The same content with noise restored
- */
function restore_noise($text)
{
global $debug_object;
@@ -2720,7 +2260,6 @@ class simple_html_dom
return $text;
}
- // Sometimes we NEED one of the noise elements.
function search_noise($text)
{
global $debug_object;
@@ -2754,7 +2293,6 @@ class simple_html_dom
}
}
- // camel naming conventions
function childNodes($idx = -1)
{
return $this->root->childNodes($idx);
@@ -2772,7 +2310,7 @@ class simple_html_dom
function createElement($name, $value = null)
{
- return @str_get_html("<$name>$value</$name>")->first_child();
+ return @str_get_html("<$name>$value</$name>")->firstChild();
}
function createTextNode($value)