[CssSelectorBridge] Improvements (#3537) (#3573)

* [CssSelectorBridge] Improvements (#3537) * Improve parameter documentation / add tooltips * Allow extracting content from home page instead of article page * Keep titles from home page when every page <title> is the same * [CssSelectorBridge] Code linting * [CssSelectorBridge] Code linting (2) * [CssSelectorBridge] Code linting (3)
2023-07-26 19:41:29 +02:00 · 2023-07-26 19:41:29 +02:00 · 977c0db382
parent 556bca58cf
commit 977c0db382
1 changed files with 80 additions and 23 deletions
--- a/bridges/CssSelectorBridge.php
+++ b/bridges/CssSelectorBridge.php
@ -15,23 +15,40 @@ class CssSelectorBridge extends BridgeAbstract
            ],
            'url_selector' => [
                'name' => 'Selector for article links or their parent elements',
+                'title' => <<<EOT
+                    This bridge works using CSS selectors, e.g. "a.article" will match all <a class="article" 
+                    href="URL">TITLE</a> on home page, each one being treated as a feed item. &#10;&#13;
+                    Instead of just a link you can selet one of its parent element. Everything inside that
+                    element becomes feed item content, e.g. image and summary present on home page.
+                    When doing so, the first link inside the selected element becomes feed item URL/Title.
+                    EOT,
                'exampleValue' => 'a.article',
                'required' => true
            ],
            'url_pattern' => [
                'name' => '[Optional] Pattern for site URLs to keep in feed',
-                'exampleValue' => 'https://example.com/article/.*',
+                'title' => 'Optionally filter items by applying a regular expression on their URL',
+                'exampleValue' => '/blog/article/.*',
            ],
            'content_selector' => [
-                'name' => '[Optional] Selector to extract each article content',
+                'name' => '[Optional] Selector to expand each article content',
+                'title' => <<<EOT
+                    When specified, the bridge will fetch each article from its URL
+                    and extract content using the provided selector (Slower!)
+                    EOT,
                'exampleValue' => 'article.content',
            ],
            'content_cleanup' => [
                'name' => '[Optional] Content cleanup: List of items to remove',
+                'title' => 'Selector for unnecessary elements to remove inside article contents.',
                'exampleValue' => 'div.ads, div.comments',
            ],
            'title_cleanup' => [
                'name' => '[Optional] Text to remove from expanded article title',
+                'title' => <<<EOT
+                    When fetching each article page, feed item title comes from page title. 
+                    Specify here some text from page title that need to be removed, e.g. " | BlogName".
+                    EOT,
                'exampleValue' => ' | BlogName',
            ],
            'limit' => self::LIMIT
@ -69,7 +86,7 @@ class CssSelectorBridge extends BridgeAbstract

        $html = defaultLinkTo(getSimpleHTMLDOM($url), $url);
        $this->feedName = $this->getPageTitle($html, $title_cleanup);
-        $items = $this->htmlFindLinks($html, $url_selector, $url_pattern, $limit);
+        $items = $this->htmlFindEntries($html, $url_selector, $url_pattern, $limit, $content_cleanup);

        if (empty($content_selector)) {
            $this->items = $items;
@ -79,7 +96,8 @@ class CssSelectorBridge extends BridgeAbstract
                    $item['uri'],
                    $content_selector,
                    $content_cleanup,
-                    $title_cleanup
+                    $title_cleanup,
+                    $item['title']
                );
            }
        }
@ -127,30 +145,71 @@ class CssSelectorBridge extends BridgeAbstract
    }

    /**
-     * Retrieve first N links from webpage URL or DOM satisfying the specified criteria
-     * @param string|object $page URL or DOM to retrieve links from
+     * Remove all elements from HTML content matching cleanup selector
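+     * @param string|object $content HTML content as HTML object or string
+     * @param string $cleanup_selector Optional selector for elements to remove, e.g. "div.ads, div.comments"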
+     * @return string|object Cleaned content (same type as input)
+     */
+    protected function cleanArticleContent($content, $cleanup_selector)
+    {
+        // Nothing to remove: hand the content back untouched. For string input this
+        // also avoids a needless parse/serialize round trip.
+        if (empty($cleanup_selector)) {
+            return $content;
+        }
+
+        // String input is parsed into a DOM for the cleanup and serialized back
+        // afterwards, so the caller receives the same type it passed in.
+        $string_convert = is_string($content);
+        if ($string_convert) {
+            $parsed = str_get_html($content);
+            if ($parsed === false) {
+                // str_get_html() returns false when it refuses to parse the input
+                // (empty or oversized string). Calling find() on false would be a
+                // fatal error, so return the content unchanged instead.
+                return $content;
+            }
+            $content = $parsed;
+        }
+
+        // Blank out every element matching the cleanup selector.
+        foreach ($content->find($cleanup_selector) as $item_to_clean) {
+            $item_to_clean->outertext = '';
+        }
+
+        return $string_convert ? $content->outertext : $content;
+    }
+
+    /**
+     * Retrieve first N link+title+truncated-content from webpage URL or DOM satisfying the specified criteria
+     * @param string|object $page URL or DOM to retrieve feed items from
     * @param string $url_selector DOM selector for matching links or their parent element
     * @param string $url_pattern Optional filter to keep only links matching the pattern
     * @param int $limit Optional maximum amount of URLs to return
-     * @return array of minimal feed items {'uri': entry_url, 'title', entry_title}
+     * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
+     * @return array of items {'uri': entry_url, 'title': entry_title, ['content': when present in DOM] }
     */
-    protected function htmlFindLinks($page, $url_selector, $url_pattern = '', $limit = 0)
+    protected function htmlFindEntries($page, $url_selector, $url_pattern = '', $limit = 0, $content_cleanup = null)
    {
+        if (is_string($page)) {
+            $page = getSimpleHTMLDOM($page);
+        }
+
        $links = $page->find($url_selector);

        if (empty($links)) {
            returnClientError('No results for URL selector');
        }

-        $link_to_title = [];
+        $link_to_item = [];
        foreach ($links as $link) {
+            $item = [];
+            if ($link->innertext != $link->plaintext) {
+                $item['content'] = $link->innertext;
+            }
            if ($link->tag != 'a') {
                $link = $link->find('a', 0);
            }
-            $link_to_title[$link->href] = $link->plaintext;
+            $item['uri'] = $link->href;
+            $item['title'] = $link->plaintext;
+            if (isset($item['content'])) {
+                $item['content'] = convertLazyLoading($item['content']);
+                $item['content'] = defaultLinkTo($item['content'], $item['uri']);
+                $item['content'] = $this->cleanArticleContent($item['content'], $content_cleanup);
+            }
+            $link_to_item[$link->href] = $item;
        }

-        $links = $this->filterUrlList(array_keys($link_to_title), $url_pattern, $limit);
+        $links = $this->filterUrlList(array_keys($link_to_item), $url_pattern, $limit);

        if (empty($links)) {
            returnClientError('No results for URL pattern');
@ -158,10 +217,7 @@ class CssSelectorBridge extends BridgeAbstract

        $items = [];
        foreach ($links as $link) {
-            $item = [];
-            $item['uri'] = $link;
-            $item['title'] = $link_to_title[$link];
-            $items[] = $item;
+            $items[] = $link_to_item[$link];
        }

        return $items;
@ -173,9 +229,10 @@ class CssSelectorBridge extends BridgeAbstract
     * @param string $content_selector HTML selector for extracting content, e.g. "article.content"
     * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
     * @param string $title_cleanup Optional string to remove from article title, e.g. " | BlogName"
+     * @param string $title_default Optional fallback title used when the page title cannot be extracted reliably
     * @return array Entry data: uri, title, content
     */
-    protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null)
+    protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null, $title_default = null)
    {
        if (empty($content_selector)) {
            returnClientError('Please specify a content selector');
@ -190,18 +247,18 @@ class CssSelectorBridge extends BridgeAbstract
            returnClientError('Could not find content selector at URL: ' . $entry_url);
        }

-        if (!empty($content_cleanup)) {
-            foreach ($article_content->find($content_cleanup) as $item_to_clean) {
-                $item_to_clean->outertext = '';
-            }
-        }
-
        $article_content = convertLazyLoading($article_content);
        $article_content = defaultLinkTo($article_content, $entry_url);
+        $article_content = $this->cleanArticleContent($article_content, $content_cleanup);
+
+        $article_title = $this->getPageTitle($entry_html, $title_cleanup);
+        if (!empty($title_default) && (empty($article_title) || $article_title === $this->feedName)) {
+            $article_title = $title_default;
+        }

        $item = [];
        $item['uri'] = $entry_url;
-        $item['title'] = $this->getPageTitle($entry_html, $title_cleanup);
+        $item['title'] = $article_title;
        $item['content'] = $article_content;
        return $item;
    }