From 1ddce120aeac1bc33f03bcfe5a343a8eed71c770 Mon Sep 17 00:00:00 2001 From: Bockiii Date: Sun, 26 Sep 2021 13:25:19 +0200 Subject: [PATCH] [EconomistBridge] Full rework (#2272) --- bridges/EconomistBridge.php | 175 +++++++++++++++++++++++------------- 1 file changed, 115 insertions(+), 60 deletions(-) diff --git a/bridges/EconomistBridge.php b/bridges/EconomistBridge.php index b58c6672..d426946b 100644 --- a/bridges/EconomistBridge.php +++ b/bridges/EconomistBridge.php @@ -1,70 +1,125 @@ array( + 'limit' => array( + 'name' => 'Feed Item Limit', + 'required' => true, + 'type' => 'number', + 'defaultValue' => 10, + 'title' => 'Maximum number of returned feed items. Maximum 30, default 10' + ) + ), + 'Topics' => array( + 'topic' => array( + 'name' => 'Topics', + 'type' => 'list', + 'title' => 'Select a Topic', + 'defaultValue' => 'latest', + 'values' => array( + 'Latest' => 'latest', + 'The world this week' => 'the-world-this-week', + 'Letters' => 'letters', + 'Leaders' => 'leaders', + 'Briefings' => 'briefing', + 'Special reports' => 'special-report', + 'Britain' => 'britain', + 'Europe' => 'europe', + 'United States' => 'united-states', + 'The Americas' => 'the-americas', + 'Middle East and Africa' => 'middle-east-and-africa', + 'Asia' => 'asia', + 'China' => 'china', + 'International' => 'international', + 'Business' => 'business', + 'Finance and economics' => 'finance-and-economics', + 'Science and technology' => 'science-and-technology', + 'Books and arts' => 'books-and-arts', + 'Obituaries' => 'obituary', + 'Graphic detail' => 'graphic-detail', + 'Indicators' => 'economic-and-financial-indicators', + ) + ) + ), + 'Blogs' => array( + 'blog' => array( + 'name' => 'Blogs', + 'type' => 'list', + 'title' => 'Select a Blog', + 'values' => array( + 'Bagehots notebook' => 'bagehots-notebook', + 'Bartleby' => 'bartleby', + 'Buttonwoods notebook' => 'buttonwoods-notebook', + 'Charlemagnes notebook' => 'charlemagnes-notebook', + 'Democracy in America' => 'democracy-in-america', + 'Erasmus' => 'erasmus', + 'Free exchange' => 'free-exchange', + 'Game theory' => 'game-theory', + 'Gulliver' => 'gulliver', + 'Kaffeeklatsch' => 'kaffeeklatsch', + 'Prospero' => 'prospero', + 'The Economist Explains' => 'the-economist-explains', + ) + ) + ) + ); + + public function collectData(){ + // get if topics or blogs were selected and store the selected category + switch ($this->queriedContext) { + case 'Topics': + $category = $this->getInput('topic'); + break; + case 'Blogs': + $category = $this->getInput('blog'); + break; + default: + $category = 'latest'; + } + // limit the returned articles to 30 at max + if ((int)$this->getInput('limit') <= 30) { + $limit = (int)$this->getInput('limit'); + } else { + $limit = 30; + } + + $this->collectExpandableDatas('https://www.economist.com/' . $category . '/rss.xml', $limit); } - public function collectData() { - $html = getSimpleHTMLDOM(self::URI . '/latest/') - or returnServerError('Could not fetch latest updates form The Economist.'); + protected function parseItem($feedItem){ + $item = parent::parseItem($feedItem); - foreach($html->find('div.teaser') as $element) { + $article = getSimpleHTMLDOM($item['uri']) + or returnServerError('Could not request Site: ' . $item['title']); + // before the article can be added, it needs to be cleaned up, thus, the extra function + $item['content'] = $this->cleanContent($article); + // only the article lead image is retained + $item['enclosures'][] = $article->find('div.article__lead-image', 0)->find('img', 0)->getAttribute('src'); - $a = $element->find('a.headline-link', 0); - $href = $a->href; + return $item; + } - if (substr($href, 0, 4) != 'http') - $href = self::URI . $a->href; - - $full = getSimpleHTMLDOMCached($href); - $article = $full->find('article', 0); - $header = $article->find('span[itemprop="headline"]', 0); - $headerimg = $article->find('div[itemprop="image"]', 0)->find('img', 0); - $author = $article->find('p[itemprop="byline"]', 0); - $time = $article->find('time', 0); - $content = $article->find('div[itemprop="text"]', 0); - $section = array( $article->find('strong[itemprop="articleSection"]', 0)->plaintext ); - - // Author - if ($author) - $author = substr($author->innertext, 3, strlen($author)); - else - $author = 'The Economist'; - - // Remove newsletter subscription box - $newsletter = $content->find('div[class="newsletter-form__message"]', 0); - if ($newsletter) - $newsletter->outertext = ''; - - $newsletterForm = $content->find('form', 0); - if ($newsletterForm) - $newsletterForm->outertext = ''; - - // Remove next and previous article URLs at the bottom - $nextprev = $content->find('div[class="blog-post__next-previous-wrapper"]', 0); - if ($nextprev) - $nextprev->outertext = ''; - - $item = array(); - $item['title'] = $header->innertext; - $item['uri'] = $href; - $item['timestamp'] = strtotime($time->datetime); - $item['author'] = $author; - $item['categories'] = $section; - - $item['content'] = '' . $content->innertext; - - $this->items[] = $item; - - if (count($this->items) >= 10) - break; + private function cleanContent($article){ + // the actual article is in this div + $content = $article->find('div.layout-article-body', 0)->innertext; + // clean the article content. Remove all div's since the text is in paragraph elements + foreach (array( + '
/i', '', $content); + // fix the relative links + $content = defaultLinkTo($content, $this->getURI()); + + return $content; } }