[ 'name' => 'Domain to use', 'required' => true, 'defaultValue' => self::DEFAULT_DOMAIN ], 'page' => [ 'name' => 'Initial page to load', 'required' => true, 'exampleValue' => 'franceinter/podcasts/burne-out' ] ]];

    /**
     * Returns the base URL used to build every request.
     *
     * Falls back to self::DEFAULT_DOMAIN when the 'domain' input is empty,
     * prefixes 'https://' when the value carries no scheme, and strips
     * trailing slashes so callers can safely append '/path'.
     *
     * @return string Base URL without trailing slash
     */
    private function getDomain()
    {
        $domain = $this->getInput('domain');
        if (empty($domain)) {
            $domain = self::DEFAULT_DOMAIN;
        }
        if (strpos($domain, '://') === false) {
            $domain = 'https://' . $domain;
        }

        // Avoid 'https://host//path' when the configured domain ends with '/'.
        return rtrim($domain, '/');
    }

    /**
     * Returns the URL of the page that is scraped: domain + '/' + 'page' input.
     *
     * @return string
     */
    public function getURI()
    {
        // A leading slash in the 'page' input would otherwise yield a double slash.
        return $this->getDomain() . '/' . ltrim((string) $this->getInput('page'), '/');
    }

    /**
     * Loads the configured page, walks every '.CardMedia' node found under
     * '.DocumentsList .DocumentsList-wrapper' and appends one feed item per
     * card that exposes at least one media manifestation.
     *
     * @throws \RuntimeException When the documents list cannot be located in the page
     */
    public function collectData()
    {
        $html = getSimpleHTMLDOM($this->getURI());

        $documentsList = $html->find('.DocumentsList', 0);
        $documentsListWrapper = $documentsList ? $documentsList->find('.DocumentsList-wrapper', 0) : null;
        if (!$documentsListWrapper) {
            // Previously this surfaced as an opaque "member function find() on null" fatal error.
            throw new \RuntimeException('Unable to find the documents list in ' . $this->getURI());
        }

        foreach ($documentsListWrapper->find('.CardMedia') as $card) {
            $item = $this->buildItemFromCard($card);
            if ($item !== null) {
                $this->items[] = $item;
            }
        }
    }

    /**
     * Builds one feed item from a '.CardMedia' node.
     *
     * @param $card A simplehtmldom element
     * @return array|null The item, or null when the card has no title link
     *                    or the API reports no manifestation for it
     */
    private function buildItemFromCard($card)
    {
        $titleLink = $card->find('.ConceptTitle a', 0);
        if (!$titleLink) {
            return null;
        }

        $uri = (string) $titleLink->getAttribute('href');
        $item = [
            'title' => $titleLink->plaintext,
            'uri' => $this->resolveUri($uri),
        ];

        // Finally, obtain the mp3 from some weird Radio France API (url obtained by reading network calls, no less)
        // NOTE(review): $uri is appended to the query string without URL-encoding - confirm the API requires the raw value.
        $media_url = self::APIENDPOINT . '?value=' . $uri;
        $rawJSON = getSimpleHTMLDOMCached($media_url);
        // The explicit cast mirrors the implicit string conversion json_decode() performed before.
        $processedJSON = json_decode((string) $rawJSON);
        $model_content = isset($processedJSON->content) ? $processedJSON->content : null;

        if (empty($model_content->manifestations) || !isset($model_content->manifestations[0]->url)) {
            error_log("Seems like $uri has no manifestation");
            return null;
        }

        $item['enclosures'] = [ $model_content->manifestations[0]->url ];
        $item['content'] = '';

        if (isset($model_content->visual->src)) {
            // Attribute values come from the remote API: escape them so a quote cannot break the markup.
            $src = htmlspecialchars($model_content->visual->src, ENT_QUOTES);
            $alt = isset($model_content->visual->legend)
                ? htmlspecialchars($model_content->visual->legend, ENT_QUOTES)
                : '';
            $item['content'] .= "<img src=\"{$src}\" alt=\"{$alt}\" style=\"float:left; width:400px; margin: 1em;\"/>";
        }

        if (isset($model_content->standFirst)) {
            $item['content'] .= $model_content->standFirst;
        }

        if (isset($model_content->bodyJson)) {
            if (!empty($item['content'])) {
                // NOTE(review): separator is a bare newline; an HTML line break may have been intended - confirm.
                $item['content'] .= "\n";
            }
            foreach ($model_content->bodyJson as $bodyElement) {
                $item['content'] .= "\n" . $this->convertJsonElementToHTML($bodyElement);
            }
        }

        if (isset($model_content->producers)) {
            $item['author'] = $this->readAuthorsNamesFrom($model_content->producers);
        } elseif (isset($model_content->staff)) {
            $item['author'] = $this->readAuthorsNamesFrom($model_content->staff);
        }

        $time = $card->find('time', 0);
        if ($time) {
            $timestamp = strtotime((string) $time->getAttribute('datetime'));
            if ($timestamp !== false) {
                $item['timestamp'] = $timestamp;
            }
        }

        return $item;
    }

    /**
     * Turns an href found in the page into an absolute URL.
     *
     * @param string $uri Absolute, protocol-relative, domain-relative or relative href
     * @return string Absolute URL
     */
    private function resolveUri($uri)
    {
        $uri = (string) $uri;

        // Already absolute: starts with a scheme such as 'https://'.
        if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $uri)) {
            return $uri;
        }

        // Protocol-relative ('//host/path'): reuse the scheme of the configured domain.
        if (strpos($uri, '//') === 0) {
            $scheme = parse_url($this->getDomain(), PHP_URL_SCHEME);
            return ($scheme ? $scheme : 'https') . ':' . $uri;
        }

        // Domain-relative ('/path') and plain relative ('path') both hang off the domain root.
        return $this->getDomain() . '/' . ltrim($uri, '/');
    }

    /**
     * Joins the 'name' property of every person into a comma separated string.
     *
     * @param array $persons_array List of objects, each expected to carry a 'name' property
     * @return string Names separated by ', ' (entries without a non-empty name are skipped)
     */
    private function readAuthorsNamesFrom($persons_array)
    {
        $names = [];
        foreach ($persons_array as $person_element) {
            if (isset($person_element->name) && $person_element->name !== '') {
                $names[] = $person_element->name;
            }
        }

        return implode(', ', $names);
    }

    /**
     * Converts one node of the JSON body tree into an HTML fragment.
     *
     * The inner HTML of a node is the conversion of its children followed by
     * its own 'value'. 'bounce' and 'audio' nodes produce no output; unknown
     * types fall back to the raw value.
     *
     * NOTE(review): the tags used for 'paragraph', 'quote' and 'list_item'
     * are inferred from the type names - confirm against the intended markup.
     *
     * @param object $jsonElement Node with a 'type' and optional 'children', 'value', 'level', 'ordered', 'data'
     * @return string HTML fragment
     */
    private function convertJsonElementToHTML($jsonElement)
    {
        $childText = isset($jsonElement->children) ? $this->convertJsonChildrenToHTML($jsonElement->children) : '';
        $valueText = isset($jsonElement->value) ? $jsonElement->value : '';
        $innerHtml = $childText . $valueText;
        $type = isset($jsonElement->type) ? $jsonElement->type : '';

        switch ($type) {
            case 'text':
                // NOTE(review): text nodes are emitted without a wrapper or escaping - confirm.
                return $innerHtml;
            case 'heading':
                // Clamp to the valid h1..h6 range; the fallback level of 2 is an arbitrary choice.
                $level = isset($jsonElement->level) ? (int) $jsonElement->level : 2;
                $level = max(1, min(6, $level));
                return "<h{$level}>{$innerHtml}</h{$level}>";
            case 'list':
                $tag = !empty($jsonElement->ordered) ? 'ol' : 'ul';
                return "<$tag>\n" . $childText . "</$tag>\n";
            case 'list_item':
                return "<li>{$innerHtml}</li>\n";
            case 'bounce':
                return '';
            case 'paragraph':
                return "<p>{$innerHtml}</p>\n";
            case 'quote':
                return "<blockquote>{$innerHtml}</blockquote>\n";
            case 'link':
                $href = isset($jsonElement->data->href)
                    ? htmlspecialchars($jsonElement->data->href, ENT_QUOTES)
                    : '';
                return "<a href=\"{$href}\">{$innerHtml}</a>\n";
            case 'audio':
                return '';
            case 'embed':
                return isset($jsonElement->data->html) ? $jsonElement->data->html : '';
            default:
                // Use the guarded value: reading $jsonElement->value directly warns when it is unset.
                return $valueText;
        }
    }

    /**
     * Converts a list of JSON body nodes and concatenates the resulting fragments.
     *
     * @param array $children Nodes accepted by convertJsonElementToHTML()
     * @return string Concatenated HTML
     */
    private function convertJsonChildrenToHTML($children)
    {
        return implode('', array_map([$this, 'convertJsonElementToHTML'], $children));
    }

    /**
     * Removes every node matched by the 'AdSlot' selector from the given element.
     *
     * NOTE(review): the selector has no leading '.' or '#', so it is passed to
     * find() as a bare name - confirm it is not meant to be the class '.AdSlot'.
     *
     * @param $element A simplehtmldom element
     * @return The same $element, after removal
     */
    private function removeAds($element)
    {
        $ads = $element->find('AdSlot');
        foreach ($ads as $ad) {
            $ad->remove();
        }
        return $element;
    }

    /**
     * Replaces all relative URIs with absolute ones
     *
     * For each pair in self::REPLACED_ATTRIBUTES, occurrences of
     * `<initial>="/` are rewritten to `<final>="<self::URI>/`.
     *
     * @param $element A simplehtmldom element
     * @return The $element->innertext with all URIs replaced
     */
    private function replaceUriInHtmlElement($element)
    {
        // Strip a trailing slash from the base so the rewritten URL never contains '//' after the host.
        $base = rtrim(self::URI, '/');
        $returned = $element->innertext;
        foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
            $returned = str_replace($initial . '="/', $final . '="' . $base . '/', $returned);
        }
        return $returned;
    }
}