From cf7e3eea5612f3466f297883395723e5b325fd1e Mon Sep 17 00:00:00 2001 From: Scott Colby Date: Fri, 15 Sep 2023 17:41:08 -0400 Subject: [PATCH] Add DeutscheWelle FeedExpander bridge. (#3673) * [DeutscheWelle] Add DeutscheWelle FeedExpander bridge. * [DeutscheWelle] Fix linting errors. --- bridges/DeutscheWelleBridge.php | 143 ++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 bridges/DeutscheWelleBridge.php diff --git a/bridges/DeutscheWelleBridge.php b/bridges/DeutscheWelleBridge.php new file mode 100644 index 00000000..2e10d670 --- /dev/null +++ b/bridges/DeutscheWelleBridge.php @@ -0,0 +1,143 @@ + [ + 'name' => 'feed', + 'type' => 'list', + 'values' => [ + 'All Top Stories and News Updates' + => 'http://rss.dw.com/atom/rss-en-all', + 'Top Stories' + => 'http://rss.dw.com/atom/rss-en-top', + 'Germany' + => 'http://rss.dw.com/atom/rss-en-ger', + 'World' + => 'http://rss.dw.com/atom/rss-en-world', + 'Europe' + => 'http://rss.dw.com/atom/rss-en-eu', + 'Business' + => 'http://rss.dw.com/atom/rss-en-bus', + 'Science' + => 'http://rss.dw.com/atom/rss_en_science', + 'Environment' + => 'http://rss.dw.com/atom/rss_en_environment', + 'Culture & Lifestyle' + => 'http://rss.dw.com/atom/rss-en-cul', + 'Sports' + => 'http://rss.dw.de/atom/rss-en-sports', + 'Visit Germany' + => 'http://rss.dw.com/atom/rss-en-visitgermany', + 'Asia' + => 'http://rss.dw.com/atom/rss-en-asia', + 'Deutsche Welle Gesamt' + => 'http://rss.dw.com/atom/rss-de-all', + 'Themen des Tages' + => 'http://rss.dw.com/atom/rss-de-top', + 'Nachrichten' + => 'http://rss.dw.com/atom/rss-de-news', + 'Wissenschaft' + => 'http://rss.dw.com/atom/rss-de-wissenschaft', + 'Sport' + => 'http://rss.dw.com/atom/rss-de-sport', + 'Deutschland entdecken' + => 'http://rss.dw.com/atom/rss-de-deutschlandentdecken', + 'Presse' + => 'http://rss.dw.com/atom/presse', + 'Politik' + => 'http://rss.dw.com/atom/rss_de_politik', + 'Wirtschaft' + => 'http://rss.dw.com/atom/rss-de-eco', + 'Kultur & Leben' 
+                => 'http://rss.dw.com/atom/rss-de-cul',
+            'Kultur & Leben: Buch'
+                => 'http://rss.dw.com/atom/rss-de-cul-buch',
+            'Kultur & Leben: Film'
+                => 'http://rss.dw.com/atom/rss-de-cul-film',
+            'Kultur & Leben: Musik'
+                => 'http://rss.dw.com/atom/rss-de-cul-musik',
+            ]
+        ]
+    ]];
+
+    /**
+     * Collect the items of the feed selected through the 'feed' input.
+     */
+    public function collectData()
+    {
+        $this->collectExpandableDatas($this->getInput('feed'));
+    }
+
+    /**
+     * Expand one feed entry with the body of the article it links to.
+     *
+     * The entry is first parsed by the parent class. The article page is then
+     * fetched (with the query string dropped from its URL), stripped of
+     * unneeded elements and appended to the item content after the teaser
+     * text, if there is one.
+     *
+     * When the entry has no usable URI, or the fetched page has no <article>
+     * element, the item is returned as produced by the parent parser.
+     *
+     * @param mixed $item Raw feed entry, passed through to parent::parseItem().
+     * @return array The parsed item, with 'author' and 'content' taken from
+     *               the article page when available.
+     */
+    protected function parseItem($item)
+    {
+        $item = parent::parseItem($item);
+
+        // nothing to expand without a link to the article
+        if (empty($item['uri'])) {
+            return $item;
+        }
+
+        $parsedUrl = parse_url($item['uri']);
+        if (!is_array($parsedUrl)) {
+            // parse_url() returns false on seriously malformed URLs
+            return $item;
+        }
+        unset($parsedUrl['query']);
+        $url = $this->unparseUrl($parsedUrl);
+
+        $page = getSimpleHTMLDOM($url);
+        $page = defaultLinkTo($page, $url);
+
+        $article = $page->find('article', 0);
+        if (is_null($article)) {
+            // page layout without an <article> element: keep the feed's own data
+            return $item;
+        }
+
+        // author
+        $author = $article->find('.author-link > span', 0);
+        if ($author) {
+            $item['author'] = $author->text();
+        }
+
+        $teaser = $article->find('.teaser-text', 0);
+        if (!is_null($teaser)) {
+            $item['content'] = $teaser->outertext();
+        } else {
+            $item['content'] = '';
+        }
+
+        // remove unneeded elements
+        foreach (
+            $article->find(
+                'header, .advertisement, [data-tracking-name="sharing-icons-inline"], a.external-link > svg, picture > source, .vjs-wrapper, .dw-widget, footer'
+            ) as $bad
+        ) {
+            $bad->remove();
+        }
+        // reload html as remove() is buggy
+        $article = str_get_html($article->outertext());
+        if (!$article) {
+            // str_get_html() can fail (e.g. on empty markup); keep the teaser only
+            return $item;
+        }
+
+        // remove width and height values from img tags
+        foreach ($article->find('img') as $img) {
+            $img->width = null;
+            $img->height = null;
+        }
+
+        // replace lazy-loaded images
+        foreach ($article->find('figure.placeholder-image') as $figure) {
+            $img = $figure->find('img', 0);
+            if (!$img) {
+                continue;
+            }
+            $dataUrl = $img->getAttribute('data-url');
+            if (!$dataUrl) {
+                // no template URL to expand: leave the existing src untouched
+                continue;
+            }
+            $img->src = str_replace('${formatId}', '906', $dataUrl);
+            $img->style = null;
+        }
+
+        $item['content'] .= $article->save();
+
+        return $item;
+    }
+
+    /**
+     * Rebuild a URL string from a component array shaped like parse_url() output.
+     *
+     * Adapted from https://www.php.net/manual/en/function.parse-url.php#106731
+     *
+     * @param array $parsed_url Components keyed 'scheme', 'host', 'port', 'user',
+     *                          'pass', 'path', 'query', 'fragment'; absent keys
+     *                          contribute nothing to the result.
+     * @return string The reassembled URL.
+     */
+    private function unparseUrl($parsed_url)
+    {
+        $scheme = isset($parsed_url['scheme']) ? $parsed_url['scheme'] . '://' : '';
+        $host = isset($parsed_url['host']) ? $parsed_url['host'] : '';
+        $port = isset($parsed_url['port']) ? ':' . $parsed_url['port'] : '';
+        $user = isset($parsed_url['user']) ? $parsed_url['user'] : '';
+        // the password is separated from the user name by a colon
+        $pass = isset($parsed_url['pass']) ? ':' . $parsed_url['pass'] : '';
+        // userinfo, when present, is terminated by '@'; compare against '' so
+        // that a user name of '0' is not mistaken for "no user"
+        $pass = ($user !== '' || $pass !== '') ? "$pass@" : '';
+        $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
+        $query = isset($parsed_url['query']) ? '?' . $parsed_url['query'] : '';
+        $fragment = isset($parsed_url['fragment']) ? '#' . $parsed_url['fragment'] : '';
+        return "$scheme$user$pass$host$port$path$query$fragment";
+    }
+}