From d592e2cb152f144a2e323e836d359df7591a6ac0 Mon Sep 17 00:00:00 2001
From: ORelio <ORelio@users.noreply.github.com>
Date: Sun, 20 Nov 2022 12:41:59 +0100
Subject: [PATCH] [Core] Add html/convertLazyLoading (+ document
 stripRecursiveHTMLSection) (#3157)

* [core] Add html/convertLazyLoading($dom)

Looks for lazy-loading attributes such as 'data-src' and converts
them back to regular ones such as 'src', which RSS readers can load.
It also converts <picture> elements to plain <img> elements.

* [core] Document html/stripRecursiveHTMLSection()

Add documentation for that function (no code changes).

* [WordPressBridge] Use convertLazyLoading()

* [WordPressBridge] Unwrap image figures

<img> inside <figure> may not display on RSS readers.
This unwraps such figures, keeping the <img> and its caption if present.

* [ZDNet] Convert lazy loading images

* [code] html/stripRecursiveHTMLSection: Fix typo
---
 bridges/WordPressBridge.php | 19 +++------
 bridges/ZDNetBridge.php     |  2 +-
 lib/html.php                | 85 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 91 insertions(+), 15 deletions(-)
diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php
index 7f419f1d..1d46958d 100644
--- a/bridges/WordPressBridge.php
+++ b/bridges/WordPressBridge.php
@@ -74,20 +74,8 @@ class WordPressBridge extends FeedExpander
             }
         }
 
-        // Convert lazy-loading images and iframes (videos...)
-        foreach ($article->find('img, iframe') as $img) {
-            if (!empty($img->getAttribute('data-src'))) {
-                $img->src = $img->getAttribute('data-src');
-            } elseif (!empty($img->getAttribute('data-srcset'))) {
-                $img->src = explode(' ', $img->getAttribute('data-srcset'))[0];
-            } elseif (!empty($img->getAttribute('data-lazy-src'))) {
-                $img->src = $img->getAttribute('data-lazy-src');
-            } elseif (!empty($img->getAttribute('srcset'))) {
-                $img->src = explode(' ', $img->getAttribute('srcset'))[0];
-            }
-        }
-
         // Find article main image
+        $article = convertLazyLoading($article);
         $article_image = $article_html->find('img.wp-post-image', 0);
         if (!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) {
             $article_image = str_get_html($item['content'])->find('img.wp-post-image', 0);
@@ -105,6 +93,11 @@ class WordPressBridge extends FeedExpander
             }
         }
 
+        // Unwrap image figures
+        foreach ($article->find('figure.wp-block-image') as $figure) {
+            $figure->outertext = $figure->innertext;
+        }
+
         if (!is_null($article)) {
             $item['content'] = $this->cleanContent($article->innertext);
             $item['content'] = defaultLinkTo($item['content'], $item['uri']);
diff --git a/bridges/ZDNetBridge.php b/bridges/ZDNetBridge.php
index 0bebeff8..693f542c 100644
--- a/bridges/ZDNetBridge.php
+++ b/bridges/ZDNetBridge.php
@@ -208,7 +208,7 @@ class ZDNetBridge extends FeedExpander
         $contents = stripWithDelimiters($contents, '<meta itemprop="image"', '>');
         $contents = stripWithDelimiters($contents, '<svg class="svg-symbol', '</svg>');
         $contents = trim(stripWithDelimiters($contents, '<section class="sharethrough-top', '</section>'));
-        $item['content'] = $contents;
+        $item['content'] = convertLazyLoading($contents);
 
         return $item;
     }
diff --git a/lib/html.php b/lib/html.php
index 873620bd..8ec30069 100644
--- a/lib/html.php
+++ b/lib/html.php
@@ -200,6 +200,69 @@ function defaultLinkTo($dom, $url)
     return $dom;
 }
 
+/**
+ * Convert lazy-loading images and frames (video embeds) into static elements.
+ *
+ * For <img>, <iframe> and <source> elements, 'src' is set from the first non-empty attribute among
+ * 'data-src', 'data-srcset', 'data-lazy-src' and 'srcset' (first space-separated token of a srcset), then
+ * 'loading', 'decoding', 'srcset', 'data-src' and 'data-srcset' are removed. <picture> becomes a plain <img>.
+ *
+ * @param string|object $dom The HTML content: an HTML string, or a parsed DOM object exposing find().
+ * @return string|object HTML string if a string was given, otherwise the result of re-parsing it with str_get_html().
+ */
+function convertLazyLoading($dom)
+{
+    $string_convert = false;
+    if (is_string($dom)) {
+        $string_convert = true;
+        $dom = str_get_html($dom);
+    }
+
+    // Resolve "src" on standalone images, embeds and picture sources; the first non-empty candidate wins
+    foreach ($dom->find('img, iframe, source') as $img) {
+        if (!empty($img->getAttribute('data-src'))) {
+            $img->src = $img->getAttribute('data-src');
+        } elseif (!empty($img->getAttribute('data-srcset'))) {
+            $img->src = explode(' ', $img->getAttribute('data-srcset'))[0];
+        } elseif (!empty($img->getAttribute('data-lazy-src'))) {
+            $img->src = $img->getAttribute('data-lazy-src');
+        } elseif (!empty($img->getAttribute('srcset'))) {
+            $img->src = explode(' ', $img->getAttribute('srcset'))[0];
+        } else {
+            continue; // No lazy-loading candidate: leave this element and its attributes untouched
+        }
+        foreach (['loading', 'decoding', 'srcset', 'data-src', 'data-srcset'] as $attr) {
+            if ($img->hasAttribute($attr)) {
+                $img->removeAttribute($attr);
+            }
+        }
+    }
+
+    // Convert complex HTML5 pictures to plain, standalone images.
+    // <img> and <source> tags that had a lazy-loading/srcset candidate got their "src" set above,
+    // so we replace the whole <picture> with the first <img> or <source> found within it
+    foreach ($dom->find('picture') as $picture) {
+        $img = $picture->find('img, source', 0);
+        if (!empty($img)) {
+            if ($img->tag == 'source') {
+                $img->tag = 'img';
+            }
+            // Adding/removing node would change its position inside the parent element,
+            // so instead we rewrite the node in-place through the outertext attribute
+            $picture->outertext = $img->outertext;
+        }
+    }
+
+    // Serialize the DOM to a string. If the caller passed an object, reload the DOM to make sure
+    // all $picture->outertext rewritten above are converted back to objects
+    $dom = $dom->outertext;
+    if (!$string_convert) {
+        $dom = str_get_html($dom);
+    }
+
+    return $dom;
+}
+
 /**
  * Extract the first part of a string matching the specified start and end delimiters
  *
@@ -245,27 +308,47 @@ function stripWithDelimiters($string, $start, $end)
  * @param string $tag_start Start of the HTML tag to remove, e.g. `<div class="ads">`
  * @return string Cleaned String, e.g. `foobar`
  *
- * @todo This function needs more documentation to make it maintainable.
+ * This function works by locating the desired tag start, then finding the appropriate
+ * end by counting opening and ending tags until the amount of open tags reaches zero:
+ *
+ * ```
+ * Amount of open tags:
+ *         1          2       1        0
+ * |---------------||---|   |----|   |----|
+ * <div class="ads"><div>ads</div>ads</div>bar
+ * | <-------- Section to remove -------> |
+ * ```
  */
 function stripRecursiveHTMLSection($string, $tag_name, $tag_start)
 {
     $open_tag = '<' . $tag_name;
     $close_tag = '</' . $tag_name . '>';
     $close_tag_length = strlen($close_tag);
+
+    // Make sure the provided $tag_start argument matches the provided $tag_name argument
     if (strpos($tag_start, $open_tag) === 0) {
+        // While tag_start is present, there is at least one remaining section to remove
         while (strpos($string, $tag_start) !== false) {
+            // In order to locate the end of the section, we attempt each closing tag until we find the right one
+            // We know we found the right one when the amount of "<tag" is the same as amount of "</tag"
+            // When the attempted "</tag" is not the correct one, we increase $search_offset to skip it
+            // and retry unless $max_recursion is reached (prevents infinite loop on malformed HTML)
             $max_recursion = 100;
             $section_to_remove = null;
             $section_start = strpos($string, $tag_start);
             $search_offset = $section_start;
             do {
                 $max_recursion--;
+                // Move on to the next occurrence of "</tag"
                 $section_end = strpos($string, $close_tag, $search_offset);
                 $search_offset = $section_end + $close_tag_length;
+                // If the next "</tag" is the correct one, then this is the section we must remove:
                 $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length);
+                // Count amount of "<tag" and "</tag" in the section to remove
                 $open_tag_count = substr_count($section_to_remove, $open_tag);
                 $close_tag_count = substr_count($section_to_remove, $close_tag);
             } while ($open_tag_count > $close_tag_count && $max_recursion > 0);
+            // We exited the loop, let's remove the section
             $string = str_replace($section_to_remove, '', $string);
         }
     }