[CssSelectorBridge] Improvements (#3537) (#3573)

* [CssSelectorBridge] Improvements (#3537)

* Improve parameter documentation / add tooltips
* Allow extracting content from home page instead of article page
* Keep titles from home page when every page <title> is the same

* [CssSelectorBridge] Code linting

* [CssSelectorBridge] Code linting (2)

* [CssSelectorBridge] Code linting (3)
This commit is contained in:
ORelio 2023-07-26 19:41:29 +02:00 committed by GitHub
parent 556bca58cf
commit 977c0db382
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 80 additions and 23 deletions

View File

@ -15,23 +15,40 @@ class CssSelectorBridge extends BridgeAbstract
],
'url_selector' => [
'name' => 'Selector for article links or their parent elements',
'title' => <<<EOT
This bridge works using CSS selectors, e.g. "a.article" will match all <a class="article"
href="URL">TITLE</a> on home page, each one being treated as a feed item. &#10;&#13;
Instead of just a link you can select one of its parent elements. Everything inside that
element becomes feed item content, e.g. image and summary present on home page.
When doing so, the first link inside the selected element becomes feed item URL/Title.
EOT,
'exampleValue' => 'a.article',
'required' => true
],
'url_pattern' => [
'name' => '[Optional] Pattern for site URLs to keep in feed',
'exampleValue' => 'https://example.com/article/.*',
'title' => 'Optionally filter items by applying a regular expression on their URL',
'exampleValue' => '/blog/article/.*',
],
'content_selector' => [
'name' => '[Optional] Selector to extract each article content',
'name' => '[Optional] Selector to expand each article content',
'title' => <<<EOT
When specified, the bridge will fetch each article from its URL
and extract content using the provided selector (Slower!)
EOT,
'exampleValue' => 'article.content',
],
'content_cleanup' => [
'name' => '[Optional] Content cleanup: List of items to remove',
'title' => 'Selector for unnecessary elements to remove inside article contents.',
'exampleValue' => 'div.ads, div.comments',
],
'title_cleanup' => [
'name' => '[Optional] Text to remove from expanded article title',
'title' => <<<EOT
When fetching each article page, feed item title comes from page title.
Specify here some text from page title that need to be removed, e.g. " | BlogName".
EOT,
'exampleValue' => ' | BlogName',
],
'limit' => self::LIMIT
@ -69,7 +86,7 @@ class CssSelectorBridge extends BridgeAbstract
$html = defaultLinkTo(getSimpleHTMLDOM($url), $url);
$this->feedName = $this->getPageTitle($html, $title_cleanup);
$items = $this->htmlFindLinks($html, $url_selector, $url_pattern, $limit);
$items = $this->htmlFindEntries($html, $url_selector, $url_pattern, $limit, $content_cleanup);
if (empty($content_selector)) {
$this->items = $items;
@ -79,7 +96,8 @@ class CssSelectorBridge extends BridgeAbstract
$item['uri'],
$content_selector,
$content_cleanup,
$title_cleanup
$title_cleanup,
$item['title']
);
}
}
@ -127,30 +145,71 @@ class CssSelectorBridge extends BridgeAbstract
}
/**
* Retrieve first N links from webpage URL or DOM satisfying the specified criteria
* @param string|object $page URL or DOM to retrieve links from
* Remove all elements from HTML content matching cleanup selector
* @param string|object $content HTML content as HTML object or string
* @return string|object Cleaned content (same type as input)
*/
protected function cleanArticleContent($content, $cleanup_selector)
{
    // $content: HTML as a string or as an already-parsed DOM object.
    // $cleanup_selector: CSS selector of elements to strip; empty means "remove nothing".
    // Remember the input type so the caller gets back the same type it passed in.
    $was_string = is_string($content);
    if ($was_string) {
        $dom = str_get_html($content);
        // The parser yields a falsy value instead of a DOM when it cannot load the
        // string (e.g. empty input). There is nothing to clean in that case, and
        // calling find() on a non-object would be a fatal error, so hand the
        // original string back unchanged.
        if (!$dom) {
            return $content;
        }
        $content = $dom;
    }

    if (!empty($cleanup_selector)) {
        foreach ($content->find($cleanup_selector) as $item_to_clean) {
            // Blanking outertext removes the matched element from the serialized output.
            $item_to_clean->outertext = '';
        }
    }

    // Serialize back to a string only when a string was passed in.
    return $was_string ? $content->outertext : $content;
}
/**
* Retrieve first N link+title+truncated-content from webpage URL or DOM satisfying the specified criteria
* @param string|object $page URL or DOM to retrieve feed items from
* @param string $url_selector DOM selector for matching links or their parent element
* @param string $url_pattern Optional filter to keep only links matching the pattern
* @param int $limit Optional maximum amount of URLs to return
* @return array of minimal feed items {'uri': entry_url, 'title', entry_title}
* @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
* @return array of items {'uri': entry_url, 'title': entry_title, ['content': when present in DOM] }
*/
protected function htmlFindLinks($page, $url_selector, $url_pattern = '', $limit = 0)
protected function htmlFindEntries($page, $url_selector, $url_pattern = '', $limit = 0, $content_cleanup = null)
{
if (is_string($page)) {
$page = getSimpleHTMLDOM($page);
}
$links = $page->find($url_selector);
if (empty($links)) {
returnClientError('No results for URL selector');
}
$link_to_title = [];
$link_to_item = [];
foreach ($links as $link) {
$item = [];
if ($link->innertext != $link->plaintext) {
$item['content'] = $link->innertext;
}
if ($link->tag != 'a') {
$link = $link->find('a', 0);
}
$link_to_title[$link->href] = $link->plaintext;
$item['uri'] = $link->href;
$item['title'] = $link->plaintext;
if (isset($item['content'])) {
$item['content'] = convertLazyLoading($item['content']);
$item['content'] = defaultLinkTo($item['content'], $item['uri']);
$item['content'] = $this->cleanArticleContent($item['content'], $content_cleanup);
}
$link_to_item[$link->href] = $item;
}
$links = $this->filterUrlList(array_keys($link_to_title), $url_pattern, $limit);
$links = $this->filterUrlList(array_keys($link_to_item), $url_pattern, $limit);
if (empty($links)) {
returnClientError('No results for URL pattern');
@ -158,10 +217,7 @@ class CssSelectorBridge extends BridgeAbstract
$items = [];
foreach ($links as $link) {
$item = [];
$item['uri'] = $link;
$item['title'] = $link_to_title[$link];
$items[] = $item;
$items[] = $link_to_item[$link];
}
return $items;
@ -173,9 +229,10 @@ class CssSelectorBridge extends BridgeAbstract
* @param string $content_selector HTML selector for extracting content, e.g. "article.content"
* @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
* @param string $title_cleanup Optional string to remove from article title, e.g. " | BlogName"
* @param string $title_default Optional title to use when could not extract title reliably
* @return array Entry data: uri, title, content
*/
protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null)
protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null, $title_default = null)
{
if (empty($content_selector)) {
returnClientError('Please specify a content selector');
@ -190,18 +247,18 @@ class CssSelectorBridge extends BridgeAbstract
returnClientError('Could not find content selector at URL: ' . $entry_url);
}
if (!empty($content_cleanup)) {
foreach ($article_content->find($content_cleanup) as $item_to_clean) {
$item_to_clean->outertext = '';
}
}
$article_content = convertLazyLoading($article_content);
$article_content = defaultLinkTo($article_content, $entry_url);
$article_content = $this->cleanArticleContent($article_content, $content_cleanup);
$article_title = $this->getPageTitle($entry_html, $title_cleanup);
if (!empty($title_default) && (empty($article_title) || $article_title === $this->feedName)) {
$article_title = $title_default;
}
$item = [];
$item['uri'] = $entry_url;
$item['title'] = $this->getPageTitle($entry_html, $title_cleanup);
$item['title'] = $article_title;
$item['content'] = $article_content;
return $item;
}