From 977c0db38222e22b364578ba1d78e800445e44f9 Mon Sep 17 00:00:00 2001
From: ORelio <ORelio@users.noreply.github.com>
Date: Wed, 26 Jul 2023 19:41:29 +0200
Subject: [PATCH] [CssSelectorBridge] Improvements (#3537) (#3573)

* [CssSelectorBridge] Improvements (#3537)

* Improve parameter documentation / add tooltips
* Allow extracting content from home page instead of article page
* Keep titles from home page when every page <title> is the same

* [CssSelectorBridge] Code linting

* [CssSelectorBridge] Code linting (2)

* [CssSelectorBridge] Code linting (3)
---
 bridges/CssSelectorBridge.php | 103 ++++++++++++++++++++++++++--------
 1 file changed, 80 insertions(+), 23 deletions(-)
diff --git a/bridges/CssSelectorBridge.php b/bridges/CssSelectorBridge.php
index ae135113..2d7489de 100644
--- a/bridges/CssSelectorBridge.php
+++ b/bridges/CssSelectorBridge.php
@@ -15,23 +15,40 @@ class CssSelectorBridge extends BridgeAbstract
             ],
             'url_selector' => [
                 'name' => 'Selector for article links or their parent elements',
+                'title' => <<<EOT
+                    This bridge works using CSS selectors, e.g. "a.article" will match all <a class="article" 
+                    href="URL">TITLE</a> on home page, each one being treated as a feed item. &#10;&#13;
+                    Instead of just a link you can select one of its parent elements. Everything inside that
+                    element becomes feed item content, e.g. image and summary present on home page.
+                    When doing so, the first link inside the selected element becomes feed item URL/Title.
+                    EOT,
                 'exampleValue' => 'a.article',
                 'required' => true
             ],
             'url_pattern' => [
                 'name' => '[Optional] Pattern for site URLs to keep in feed',
-                'exampleValue' => 'https://example.com/article/.*',
+                'title' => 'Optionally filter items by applying a regular expression on their URL',
+                'exampleValue' => '/blog/article/.*',
             ],
             'content_selector' => [
-                'name' => '[Optional] Selector to extract each article content',
+                'name' => '[Optional] Selector to expand each article content',
+                'title' => <<<EOT
+                    When specified, the bridge will fetch each article from its URL
+                    and extract content using the provided selector (Slower!)
+                    EOT,
                 'exampleValue' => 'article.content',
             ],
             'content_cleanup' => [
                 'name' => '[Optional] Content cleanup: List of items to remove',
+                'title' => 'Selector for unnecessary elements to remove inside article contents.',
                 'exampleValue' => 'div.ads, div.comments',
             ],
             'title_cleanup' => [
                 'name' => '[Optional] Text to remove from expanded article title',
+                'title' => <<<EOT
+                    When fetching each article page, feed item title comes from page title. 
+                    Specify here some text from page title that needs to be removed, e.g. " | BlogName".
+                    EOT,
                 'exampleValue' => ' | BlogName',
             ],
             'limit' => self::LIMIT
@@ -69,7 +86,7 @@ class CssSelectorBridge extends BridgeAbstract
 
         $html = defaultLinkTo(getSimpleHTMLDOM($url), $url);
         $this->feedName = $this->getPageTitle($html, $title_cleanup);
-        $items = $this->htmlFindLinks($html, $url_selector, $url_pattern, $limit);
+        $items = $this->htmlFindEntries($html, $url_selector, $url_pattern, $limit, $content_cleanup);
 
         if (empty($content_selector)) {
             $this->items = $items;
@@ -79,7 +96,8 @@ class CssSelectorBridge extends BridgeAbstract
                     $item['uri'],
                     $content_selector,
                     $content_cleanup,
-                    $title_cleanup
+                    $title_cleanup,
+                    $item['title']
                 );
             }
         }
@@ -127,30 +145,71 @@ class CssSelectorBridge extends BridgeAbstract
     }
 
     /**
-     * Retrieve first N links from webpage URL or DOM satisfying the specified criteria
-     * @param string|object $page URL or DOM to retrieve links from
+     * Remove all elements matching the cleanup selector from HTML content
+     * @param string|object $content HTML content as HTML object or string
+     * @param string|null $cleanup_selector Optional selector for elements to remove; empty removes nothing
+     * @return string|object Cleaned content (same type as input), or the input itself if it cannot be parsed
+     */
+    protected function cleanArticleContent($content, $cleanup_selector)
+    {
+        $string_convert = is_string($content);
+        $dom = $string_convert ? str_get_html($content) : $content;
+
+        // str_get_html() returns false for empty or oversized input: nothing to clean in that case
+        if (!is_object($dom)) {
+            return $content;
+        }
+
+        if (!empty($cleanup_selector)) {
+            foreach ($dom->find($cleanup_selector) as $item_to_clean) {
+                $item_to_clean->outertext = '';
+            }
+        }
+
+        return $string_convert ? $dom->outertext : $dom;
+    }
+
+    /**
+     * Retrieve the first N entries (link, title, truncated content) from a webpage URL or DOM matching the criteria
+     * @param string|object $page URL or DOM to retrieve feed items from
      * @param string $url_selector DOM selector for matching links or their parent element
      * @param string $url_pattern Optional filter to keep only links matching the pattern
      * @param int $limit Optional maximum amount of URLs to return
-     * @return array of minimal feed items {'uri': entry_url, 'title', entry_title}
+     * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
+     * @return array of items {'uri': entry_url, 'title': entry_title, ['content': when present in DOM] }
      */
-    protected function htmlFindLinks($page, $url_selector, $url_pattern = '', $limit = 0)
+    protected function htmlFindEntries($page, $url_selector, $url_pattern = '', $limit = 0, $content_cleanup = null)
     {
+        if (is_string($page)) {
+            $page = getSimpleHTMLDOM($page);
+        }
+
         $links = $page->find($url_selector);
 
         if (empty($links)) {
             returnClientError('No results for URL selector');
         }
 
-        $link_to_title = [];
+        $link_to_item = [];
         foreach ($links as $link) {
+            $item = [];
+            if ($link->innertext != $link->plaintext) {
+                $item['content'] = $link->innertext;
+            }
             if ($link->tag != 'a') {
                 $link = $link->find('a', 0);
             }
-            $link_to_title[$link->href] = $link->plaintext;
+            $item['uri'] = $link->href;
+            $item['title'] = $link->plaintext;
+            if (isset($item['content'])) {
+                $item['content'] = convertLazyLoading($item['content']);
+                $item['content'] = defaultLinkTo($item['content'], $item['uri']);
+                $item['content'] = $this->cleanArticleContent($item['content'], $content_cleanup);
+            }
+            $link_to_item[$link->href] = $item;
         }
 
-        $links = $this->filterUrlList(array_keys($link_to_title), $url_pattern, $limit);
+        $links = $this->filterUrlList(array_keys($link_to_item), $url_pattern, $limit);
 
         if (empty($links)) {
             returnClientError('No results for URL pattern');
@@ -158,10 +217,7 @@ class CssSelectorBridge extends BridgeAbstract
 
         $items = [];
         foreach ($links as $link) {
-            $item = [];
-            $item['uri'] = $link;
-            $item['title'] = $link_to_title[$link];
-            $items[] = $item;
+            $items[] = $link_to_item[$link];
         }
 
         return $items;
@@ -173,9 +229,10 @@ class CssSelectorBridge extends BridgeAbstract
      * @param string $content_selector HTML selector for extracting content, e.g. "article.content"
      * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
      * @param string $title_cleanup Optional string to remove from article title, e.g. " | BlogName"
+     * @param string $title_default Optional title to use when the page title could not be extracted reliably
      * @return array Entry data: uri, title, content
      */
-    protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null)
+    protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null, $title_default = null)
     {
         if (empty($content_selector)) {
             returnClientError('Please specify a content selector');
@@ -190,18 +247,18 @@ class CssSelectorBridge extends BridgeAbstract
             returnClientError('Could not find content selector at URL: ' . $entry_url);
         }
 
-        if (!empty($content_cleanup)) {
-            foreach ($article_content->find($content_cleanup) as $item_to_clean) {
-                $item_to_clean->outertext = '';
-            }
-        }
-
         $article_content = convertLazyLoading($article_content);
         $article_content = defaultLinkTo($article_content, $entry_url);
+        $article_content = $this->cleanArticleContent($article_content, $content_cleanup);
+
+        $article_title = $this->getPageTitle($entry_html, $title_cleanup);
+        if (!empty($title_default) && (empty($article_title) || $article_title === $this->feedName)) {
+            $article_title = $title_default;
+        }
 
         $item = [];
         $item['uri'] = $entry_url;
-        $item['title'] = $this->getPageTitle($entry_html, $title_cleanup);
+        $item['title'] = $article_title;
         $item['content'] = $article_content;
         return $item;
     }