[ 'name' => 'Topic', 'type' => 'list', 'values' => [ 'All articles' => '', 'Tech' => 'tech', 'Money' => 'personal-finance', 'Home' => 'home', 'Wellness' => 'health', 'Energy' => 'home/energy-and-utilities', 'Deals' => 'deals', 'Computing' => 'tech/computing', 'Mobile' => 'tech/mobile', 'Science' => 'science', 'Services' => 'tech/services-and-software' ] ], 'limit' => self::LIMIT ] ];

    /**
     * Collect feed items from the monthly article sitemaps.
     *
     * Reads the `topic` and `limit` inputs, asks sitemapXmlToList() for the
     * article URLs found in the sitemaps of the current and the previous
     * month (the topic URL prefix, if any, is handed over as the URL
     * pattern), keeps at most `limit` links, then loads every article page
     * and appends one item per article to $this->items.
     *
     * Each item carries `uri`, `title`, `author` and `content`; `timestamp`
     * and `enclosures` are only set when the page's ld+json metadata
     * provides a parseable publication date / an image URL.
     *
     * Pages that match none of the known article-body selectors are skipped
     * instead of aborting the whole feed. A missing title or author element
     * yields an empty string.
     *
     * Reports 'Failed to retrieve article list' through returnClientError()
     * when no link could be listed.
     */
    public function collectData()
    {
        $topic = $this->getInput('topic');
        $limit = $this->getInput('limit');
        $limit = empty($limit) ? 10 : $limit;

        // The "All articles" choice maps to an empty topic: no URL pattern then.
        $url_pattern = empty($topic) ? '' : self::URI . $topic;

        $sitemap_base = self::URI . 'sitemaps/article/';
        $sitemap_latest = $sitemap_base . date('Y/m') . '.xml';
        $sitemap_previous = $sitemap_base . date('Y/m', strtotime('last day of previous month')) . '.xml';

        // Current month first, previous month appended after it.
        $links = array_merge(
            $this->sitemapXmlToList($this->getSitemapXml($sitemap_latest, true), $url_pattern, $limit),
            $this->sitemapXmlToList($this->getSitemapXml($sitemap_previous, true), $url_pattern, $limit)
        );

        // Both sitemaps were queried with the same limit, so trim the merged list.
        if ($limit > 0 && count($links) > $limit) {
            $links = array_slice($links, 0, $limit);
        }

        if (empty($links)) {
            returnClientError('Failed to retrieve article list');
        }

        foreach ($links as $article_uri) {
            $article_dom = convertLazyLoading(getSimpleHTMLDOMCached($article_uri));

            $content = $article_dom->find(
                'div.c-pageArticle_content, div.single-article__content, div.article-main-body',
                0
            );
            if (!$content) {
                // Unknown page layout: calling find() on a missing node would be
                // a fatal error, so drop this article and keep building the feed.
                continue;
            }

            $title_node = $article_dom->find('h1', 0);
            $title = $title_node ? trim($title_node->plaintext) : '';

            $author_node = $article_dom->find('span.c-assetAuthor_name', 0);
            $author = $author_node ? $author_node->plaintext : '';

            $headline = $article_dom->find('p.c-contentHeader_description', 0);

            $date = null;
            $enclosure = null;

            // Publication date and lead image come from the ld+json metadata;
            // when several blocks provide a value, the last usable one wins.
            foreach ($article_dom->find('script[type=application/ld+json]') as $ldjson) {
                $datePublished = extractFromDelimiters($ldjson->innertext, '"datePublished":"', '"');
                if ($datePublished !== false) {
                    $timestamp = strtotime($datePublished);
                    // strtotime() returns false on unparseable input; never
                    // let that overwrite a valid date or reach the item.
                    if ($timestamp !== false) {
                        $date = $timestamp;
                    }
                }

                $imageObject = extractFromDelimiters($ldjson->innertext, 'ImageObject","url":"', '"');
                if ($imageObject !== false && $imageObject !== '') {
                    $enclosure = $imageObject;
                }
            }

            // Drop embedded galleries from the article body.
            foreach ($content->find('div.c-shortcodeGallery') as $gallery) {
                $gallery->outertext = '';
            }

            // Reduce each <figure> to its first image, when it has one.
            foreach ($content->find('figure') as $figure) {
                $img = $figure->find('img', 0);
                if ($img) {
                    $figure->outertext = $img->outertext;
                }
            }

            $body = $content->innertext;

            // The separator strings below are plain newlines, same values as
            // before; they are only spelled with escapes for readability.
            if (!is_null($enclosure)) {
                $body = "\n" . $body;
            }

            if ($headline) {
                $body = "\n\n" . $headline->plaintext . "\n\n\n" . $body;
            }

            $item = [
                'uri' => $article_uri,
                'title' => $title,
                'author' => $author,
                'content' => $body,
            ];

            if (!is_null($date)) {
                $item['timestamp'] = $date;
            }

            if (!is_null($enclosure)) {
                $item['enclosures'] = [$enclosure];
            }

            $this->items[] = $item;
        }
    }
}