From 977c0db38222e22b364578ba1d78e800445e44f9 Mon Sep 17 00:00:00 2001 From: ORelio Date: Wed, 26 Jul 2023 19:41:29 +0200 Subject: [PATCH] [CssSelectorBridge] Improvements (#3537) (#3573) * [CssSelectorBridge] Improvements (#3537) * Improve parameter documentation / add tooltips * Allow extracting content from home page instead of article page * Keep titles from home page when every page is the same * [CssSelectorBridge] Code linting * [CssSelectorBridge] Code linting (2) * [CssSelectorBridge] Code linting (3) --- bridges/CssSelectorBridge.php | 103 ++++++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 23 deletions(-) diff --git a/bridges/CssSelectorBridge.php b/bridges/CssSelectorBridge.php index ae135113..2d7489de 100644 --- a/bridges/CssSelectorBridge.php +++ b/bridges/CssSelectorBridge.php @@ -15,23 +15,40 @@ class CssSelectorBridge extends BridgeAbstract ], 'url_selector' => [ 'name' => 'Selector for article links or their parent elements', + 'title' => <<<EOT + This bridge works using CSS selectors, e.g. "a.article" will match all <a class="article" + href="URL">TITLE</a> on home page, each one being treated as a feed item. + Instead of just a link you can select one of its parent elements. Everything inside that + element becomes feed item content, e.g. image and summary present on home page. + When doing so, the first link inside the selected element becomes feed item URL/Title. 
+ EOT, 'exampleValue' => 'a.article', 'required' => true ], 'url_pattern' => [ 'name' => '[Optional] Pattern for site URLs to keep in feed', - 'exampleValue' => 'https://example.com/article/.*', + 'title' => 'Optionally filter items by applying a regular expression on their URL', + 'exampleValue' => '/blog/article/.*', ], 'content_selector' => [ - 'name' => '[Optional] Selector to extract each article content', + 'name' => '[Optional] Selector to expand each article content', + 'title' => <<<EOT + When specified, the bridge will fetch each article from its URL + and extract content using the provided selector (Slower!) + EOT, 'exampleValue' => 'article.content', ], 'content_cleanup' => [ 'name' => '[Optional] Content cleanup: List of items to remove', + 'title' => 'Selector for unnecessary elements to remove inside article contents.', 'exampleValue' => 'div.ads, div.comments', ], 'title_cleanup' => [ 'name' => '[Optional] Text to remove from expanded article title', + 'title' => <<<EOT + When fetching each article page, feed item title comes from page title. + Specify here some text from page title that needs to be removed, e.g. " | BlogName". 
+ EOT, 'exampleValue' => ' | BlogName', ], 'limit' => self::LIMIT @@ -69,7 +86,7 @@ class CssSelectorBridge extends BridgeAbstract $html = defaultLinkTo(getSimpleHTMLDOM($url), $url); $this->feedName = $this->getPageTitle($html, $title_cleanup); - $items = $this->htmlFindLinks($html, $url_selector, $url_pattern, $limit); + $items = $this->htmlFindEntries($html, $url_selector, $url_pattern, $limit, $content_cleanup); if (empty($content_selector)) { $this->items = $items; @@ -79,7 +96,8 @@ class CssSelectorBridge extends BridgeAbstract $item['uri'], $content_selector, $content_cleanup, - $title_cleanup + $title_cleanup, + $item['title'] ); } } @@ -127,30 +145,71 @@ class CssSelectorBridge extends BridgeAbstract } /** - * Retrieve first N links from webpage URL or DOM satisfying the specified criteria - * @param string|object $page URL or DOM to retrieve links from + * Remove all elements from HTML content matching cleanup selector + * @param string|object $content HTML content as HTML object or string + * @return string|object Cleaned content (same type as input) + */ + protected function cleanArticleContent($content, $cleanup_selector) + { + $string_convert = false; + if (is_string($content)) { + $string_convert = true; + $content = str_get_html($content); + } + + if (!empty($cleanup_selector)) { + foreach ($content->find($cleanup_selector) as $item_to_clean) { + $item_to_clean->outertext = ''; + } + } + + if ($string_convert) { + $content = $content->outertext; + } + return $content; + } + + /** + * Retrieve first N link+title+truncated-content from webpage URL or DOM satisfying the specified criteria + * @param string|object $page URL or DOM to retrieve feed items from * @param string $url_selector DOM selector for matching links or their parent element * @param string $url_pattern Optional filter to keep only links matching the pattern * @param int $limit Optional maximum amount of URLs to return - * @return array of minimal feed items {'uri': entry_url, 'title', 
entry_title} + * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments" + * @return array of items {'uri': entry_url, 'title': entry_title, ['content': when present in DOM] } */ - protected function htmlFindLinks($page, $url_selector, $url_pattern = '', $limit = 0) + protected function htmlFindEntries($page, $url_selector, $url_pattern = '', $limit = 0, $content_cleanup = null) { + if (is_string($page)) { + $page = getSimpleHTMLDOM($page); + } + $links = $page->find($url_selector); if (empty($links)) { returnClientError('No results for URL selector'); } - $link_to_title = []; + $link_to_item = []; foreach ($links as $link) { + $item = []; + if ($link->innertext != $link->plaintext) { + $item['content'] = $link->innertext; + } if ($link->tag != 'a') { $link = $link->find('a', 0); } - $link_to_title[$link->href] = $link->plaintext; + $item['uri'] = $link->href; + $item['title'] = $link->plaintext; + if (isset($item['content'])) { + $item['content'] = convertLazyLoading($item['content']); + $item['content'] = defaultLinkTo($item['content'], $item['uri']); + $item['content'] = $this->cleanArticleContent($item['content'], $content_cleanup); + } + $link_to_item[$link->href] = $item; } - $links = $this->filterUrlList(array_keys($link_to_title), $url_pattern, $limit); + $links = $this->filterUrlList(array_keys($link_to_item), $url_pattern, $limit); if (empty($links)) { returnClientError('No results for URL pattern'); @@ -158,10 +217,7 @@ class CssSelectorBridge extends BridgeAbstract $items = []; foreach ($links as $link) { - $item = []; - $item['uri'] = $link; - $item['title'] = $link_to_title[$link]; - $items[] = $item; + $items[] = $link_to_item[$link]; } return $items; @@ -173,9 +229,10 @@ class CssSelectorBridge extends BridgeAbstract * @param string $content_selector HTML selector for extracting content, e.g. "article.content" * @param string $content_cleanup Optional selector for removing elements, e.g. 
"div.ads, div.comments" * @param string $title_cleanup Optional string to remove from article title, e.g. " | BlogName" + * @param string $title_default Optional title to use when could not extract title reliably * @return array Entry data: uri, title, content */ - protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null) + protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null, $title_default = null) { if (empty($content_selector)) { returnClientError('Please specify a content selector'); @@ -190,18 +247,18 @@ class CssSelectorBridge extends BridgeAbstract returnClientError('Could not find content selector at URL: ' . $entry_url); } - if (!empty($content_cleanup)) { - foreach ($article_content->find($content_cleanup) as $item_to_clean) { - $item_to_clean->outertext = ''; - } - } - $article_content = convertLazyLoading($article_content); $article_content = defaultLinkTo($article_content, $entry_url); + $article_content = $this->cleanArticleContent($article_content, $content_cleanup); + + $article_title = $this->getPageTitle($entry_html, $title_cleanup); + if (!empty($title_default) && (empty($article_title) || $article_title === $this->feedName)) { + $article_title = $title_default; + } $item = []; $item['uri'] = $entry_url; - $item['title'] = $this->getPageTitle($entry_html, $title_cleanup); + $item['title'] = $article_title; $item['content'] = $article_content; return $item; }