From 3557e5ffd4c2024c36d6a448c520cb9afd5cc2aa Mon Sep 17 00:00:00 2001 From: ORelio Date: Sat, 30 Sep 2023 15:03:52 +0200 Subject: [PATCH] [CssSelector/Sitemap] Minor fixes (#3719) - Apply title_cleanup to title from metadata (#3717) - Metadata: Fix ld+json object/array confusion - Sitemap: Also try /sitemap.xml well known url --- bridges/CssSelectorBridge.php | 24 ++++++++++++++++++------ bridges/SitemapBridge.php | 10 ++++++++-- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/bridges/CssSelectorBridge.php b/bridges/CssSelectorBridge.php index c5a09822..dd8fe228 100644 --- a/bridges/CssSelectorBridge.php +++ b/bridges/CssSelectorBridge.php @@ -91,7 +91,7 @@ class CssSelectorBridge extends BridgeAbstract $limit = $this->getInput('limit') ?? 10; $html = defaultLinkTo(getSimpleHTMLDOM($url), $url); - $this->feedName = $this->getPageTitle($html, $title_cleanup); + $this->feedName = $this->titleCleanup($this->getPageTitle($html), $title_cleanup); $items = $this->htmlFindEntries($html, $url_selector, $url_pattern, $limit, $content_cleanup); if (empty($content_selector)) { @@ -139,17 +139,27 @@ class CssSelectorBridge extends BridgeAbstract /** * Retrieve title from webpage URL or DOM * @param string|object $page URL or DOM to retrieve title from - * @param string $title_cleanup optional string to remove from webpage title, e.g. " | BlogName" * @return string Webpage title */ - protected function getPageTitle($page, $title_cleanup = null) + protected function getPageTitle($page) { if (is_string($page)) { $page = getSimpleHTMLDOMCached($page); } $title = html_entity_decode($page->find('title', 0)->plaintext); - if (!empty($title)) { - $title = trim(str_replace($title_cleanup, '', $title)); + return $title; + } + + /** + * Clean Article title. Remove constant part that appears in every title such as blog name. + * @param string $title Title to clean, e.g. "Article Name | BlogName" + * @param string $title_cleanup string to remove from webpage title, e.g. " | BlogName" + * @return string Cleaned Title + */ + protected function titleCleanup($title, $title_cleanup) + { + if (!empty($title) && !empty($title_cleanup)) { + return trim(str_replace($title_cleanup, '', $title)); } return $title; } @@ -270,6 +280,8 @@ class CssSelectorBridge extends BridgeAbstract $item['title'] = $article_title; } + $item['title'] = $this->titleCleanup($item['title'], $title_cleanup); + $article_content = $entry_html->find($content_selector); if (!empty($article_content)) { @@ -484,7 +496,7 @@ class CssSelectorBridge extends BridgeAbstract // Now we can check for desired field in JSON and populate $item accordingly if (isset($json_root[$field])) { $field_value = $json_root[$field]; - if (is_array($field_value)) { + if (is_array($field_value) && isset($field_value[0])) { $field_value = $field_value[0]; // Different versions of the same enclosure? Take the first one } if (is_string($field_value) && !empty($field_value)) { diff --git a/bridges/SitemapBridge.php b/bridges/SitemapBridge.php index 482cbb66..78526e6e 100644 --- a/bridges/SitemapBridge.php +++ b/bridges/SitemapBridge.php @@ -73,7 +73,7 @@ class SitemapBridge extends CssSelectorBridge $discard_thumbnail = $this->getInput('discard_thumbnail'); $limit = $this->getInput('limit'); - $this->feedName = $this->getPageTitle($url, $title_cleanup); + $this->feedName = $this->titleCleanup($this->getPageTitle($url), $title_cleanup); $sitemap_url = empty($site_map) ? $url : $site_map; $sitemap_xml = $this->getSitemapXml($sitemap_url, !empty($site_map)); $links = $this->sitemapXmlToList($sitemap_xml, $url_pattern, empty($limit) ? 10 : $limit); @@ -103,7 +103,13 @@ class SitemapBridge extends CssSelectorBridge $robots_txt = getSimpleHTMLDOM(urljoin($url, '/robots.txt'))->outertext; preg_match('/Sitemap: ([^ ]+)/', $robots_txt, $matches); if (empty($matches)) { - returnClientError('Failed to determine Sitemap from robots.txt. Try setting it manually.'); + $sitemap = getSimpleHTMLDOM(urljoin($url, '/sitemap.xml')); + if (!empty($sitemap->find('urlset, sitemap'))) { + $url = urljoin($url, '/sitemap.xml'); + return $sitemap; + } else { + returnClientError('Failed to locate Sitemap from /robots.txt or /sitemap.xml. Try setting it manually.'); + } } $url = $matches[1]; }