diff --git a/bridges/SchweinfurtBuergerinformationenBridge.php b/bridges/SchweinfurtBuergerinformationenBridge.php
new file mode 100644
index 00000000..1cee949a
--- /dev/null
+++ b/bridges/SchweinfurtBuergerinformationenBridge.php
@@ -0,0 +1,121 @@
+ array(
+                'name' => 'Number of pages',
+                'type' => 'number',
+                'title' => 'Specifies the number of pages to fetch. Usually one or two are enough.',
+                'exampleValue' => '1',
+                'defaultValue' => '1',
+            )
+        )
+    );
+
+    /**
+     * Returns the URL of the icon shown for this bridge.
+     *
+     * @return string
+     */
+    public function getIcon()
+    {
+        return 'https://www.schweinfurt.de/__/images/favicon.ico';
+    }
+
+    /**
+     * Builds the feed: gathers the article IDs from the requested number of
+     * index pages, then fetches every article and appends it to $this->items.
+     *
+     * Index pages are requested with art_pager values 0 .. pages-1.
+     */
+    public function collectData()
+    {
+        // The input may arrive as a numeric string, which a plain is_int()
+        // test would reject. Validate it as an integer instead; anything
+        // that is not an integer >= 1 falls back to a single page.
+        $pages = filter_var($this->getInput('pages'), FILTER_VALIDATE_INT);
+        if ($pages === false || $pages < 1) {
+            $pages = 1;
+        }
+
+        $articleIDs = array();
+        for ($page = 0; $page < $pages; $page++) {
+            $articleIDs = array_merge($articleIDs, $this->getArticleIDsFromPage($page));
+        }
+
+        // The same article can be listed on more than one index page.
+        // Keep only its first occurrence so the feed has no duplicate items.
+        $articleIDs = array_unique($articleIDs);
+
+        foreach ($articleIDs as $articleID) {
+            $this->items[] = $this->generateItemFromArticle($articleID);
+
+            // Stop after the first article when debug mode is enabled.
+            if (Debug::isEnabled()) {
+                break;
+            }
+        }
+    }
+
+    /**
+     * Reads one index page and returns the IDs of the articles listed on it.
+     *
+     * @param int $page Value sent as the art_pager query parameter.
+     * @return array Article IDs (digit strings) in the order they appear on the page.
+     */
+    private function getArticleIDsFromPage($page)
+    {
+        // self::URI is passed as an argument rather than being part of the
+        // format string, so a literal '%' in it cannot be misinterpreted.
+        $url = sprintf('%s?art_pager=%d', self::URI, $page);
+        $html = getSimpleHTMLDOMCached($url, self::INDEX_CACHE_TIMEOUT)
+            or returnServerError('Could not retrieve ' . $url);
+
+        $articleIDs = array();
+
+        foreach ($html->find('div.artikel-uebersicht') as $article) {
+            // The article ID is in the 'id' attribute of the div element, prefixed with 'artikel_id_'
+            if (preg_match('/artikel_id_(\d+)/', $article->id, $match)) {
+                $articleIDs[] = $match[1];
+            } else {
+                returnServerError('Couldn\'t determine article ID from index page.');
+            }
+        }
+
+        return $articleIDs;
+    }
+
+    /**
+     * Fetches a single article page and converts it into a feed item.
+     *
+     * @param string $id Article ID, inserted into self::ARTICLE_URI.
+     * @return array Item with the keys 'uri', 'title', 'content' and 'uid',
+     *               plus 'enclosures' and 'timestamp' when they can be determined.
+     */
+    private function generateItemFromArticle($id)
+    {
+        $url = sprintf(self::ARTICLE_URI, $id);
+        $html = getSimpleHTMLDOMCached($url, self::ARTICLE_CACHE_TIMEOUT)
+            or returnServerError('Could not retrieve ' . $url);
+
+        // find(..., 0) yields null when nothing matches; report that instead
+        // of failing with a method call on null further down.
+        $div = $html->find('div#artikel-detail', 0)
+            or returnServerError('Could not find the article container in ' . $url);
+        $divContent = $div->find('.c-content', 0)
+            or returnServerError('Could not find the article content in ' . $url);
+
+        // Every external link has a little arrow symbol image attached to it.
+        // Remove this image (this has to be done before building $content) and
+        // collect all other images so they can be attached as enclosures.
+        $enclosures = array();
+        foreach ($divContent->find('img') as $image) {
+            if ($image->class === 'imgextlink') {
+                $image->outertext = '';
+            } elseif ($image->src) {
+                $enclosures[] = $image->src;
+            }
+        }
+
+        // Title and teaser are treated as empty when their element is missing.
+        $titleNode = $div->find('.c-title', 0);
+        $teaserNode = $div->find('.c-teaser', 0);
+        $title = $titleNode ? $titleNode->innertext : '';
+        $teaser = $teaserNode ? $teaserNode->innertext : '';
+        $content = $divContent->innertext;
+
+        // The title can contain HTML entities. These can be converted back
+        // to regular UTF-8 characters.
+        $title = html_entity_decode($title, ENT_HTML5, 'UTF-8');
+
+        // If there's a teaser, make it more eye-catching,
+        // so that it is clear, that this is not part of the actual content.
+        if (trim($teaser) !== '') {
+            $content = '<b>' . $teaser . '</b>' . $content;
+        }
+
+        $item = array(
+            'uri' => $url,
+            'title' => $title,
+            'content' => $content,
+            'uid' => $id,
+        );
+
+        if (count($enclosures) > 0) {
+            $item['enclosures'] = $enclosures;
+        }
+
+        // Get the date of the article. Example: "zuletzt geändert: 26.05.2020"
+        // The last date found in the text is used, so surrounding words or
+        // trailing whitespace do not matter.
+        $editNode = $div->find('div#edit', 0);
+        if ($editNode && preg_match_all('/\d{1,2}\.\d{1,2}\.\d{4}/', $editNode->plaintext, $matches)) {
+            // The leading '!' resets all fields that are not part of the format.
+            // Without it the time of day is taken from the current clock and
+            // the timestamp of an unchanged article differs on every fetch.
+            $editDate = DateTime::createFromFormat('!d.m.Y', end($matches[0]));
+
+            if ($editDate !== false) {
+                $item['timestamp'] = $editDate->getTimestamp();
+            }
+        }
+
+        return $item;
+    }
+}