From f259fa7f9f601431c9e2481d1bd546db46e945ef Mon Sep 17 00:00:00 2001 From: MarKoeh <75181140+Mar-Koeh@users.noreply.github.com> Date: Mon, 10 Jan 2022 11:47:49 +0100 Subject: [PATCH] [ARDMediathekBridge] Switch to JSON-API (#2380) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Switch ARDMediathekBridge to JSON-API The html screen scraping approach of ARDMediathekBridge did not work reliably. I could not find one show for which the item list was not empty using the html screen scraping approach. The proposed change uses the JSON-API of the WebApp. Although there is still room for improvement (feed title, better understanding of the API, more accurate mimic of the webapp's behavior, de-pagination …), it does work with this change. Indicate that now full URLs as well as just the ID are accepted. --- bridges/ARDMediathekBridge.php | 79 ++++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 12 deletions(-) diff --git a/bridges/ARDMediathekBridge.php b/bridges/ARDMediathekBridge.php index 5ac8a41d..e3c9967a 100644 --- a/bridges/ARDMediathekBridge.php +++ b/bridges/ARDMediathekBridge.php @@ -4,14 +4,48 @@ class ARDMediathekBridge extends BridgeAbstract { const URI = 'https://www.ardmediathek.de'; const DESCRIPTION = 'Feed of any series in the ARD-Mediathek, specified by its path'; const MAINTAINER = 'yue-dongchen'; + /* + * Number of Items to be requested from ARDmediathek API + * 12 has been observed in the wild + * 29 is the highest successfully tested value + * More Items could be fetched via pagination + * The JSON-field pagination holds more information on that + * @const PAGESIZE number of requested items + */ + const PAGESIZE = 29; + /* + * The URL Prefix of the (Webapp-)API + * @const APIENDPOINT https-URL of the used endpoint + */ + const APIENDPOINT = 'https://api.ardmediathek.de/page-gateway/widgets/ard/asset/'; + /* + * The URL prefix of the video link + * URLs from the webapp include a slug 
containing titles of show, episode, and tv station. + * It seems to work without that. + * @const VIDEOLINKPREFIX https-URL prefix of video links + */ + const VIDEOLINKPREFIX = 'https://www.ardmediathek.de/video/'; + /* + * The requested width of the preview image + * 432 has been observed in the wild + * The webapp seems to also compute and add the height value + * It seems to work without that. + * @const IMAGEWIDTH width in px of the preview image + */ + const IMAGEWIDTH = 432; + /* + * Placeholder that will be replaced by IMAGEWIDTH in the preview image URL + * @const IMAGEWIDTHPLACEHOLDER + */ + const IMAGEWIDTHPLACEHOLDER = '{width}'; const PARAMETERS = array( array( 'path' => array( - 'name' => 'Path', + 'name' => 'Show Link or ID', 'required' => true, - 'title' => 'Enter without trailing slash', - 'defaultValue' => '45-min/Y3JpZDovL25kci5kZS8xMzkx' + 'title' => 'Link to the show page or just its alphanumeric suffix', + 'defaultValue' => 'https://www.ardmediathek.de/sendung/45-min/Y3JpZDovL25kci5kZS8xMzkx/' ) ) ); @@ -19,17 +53,38 @@ class ARDMediathekBridge extends BridgeAbstract { public function collectData() { date_default_timezone_set('Europe/Berlin'); - $url = 'https://www.ardmediathek.de/sendung/' . $this->getInput('path') . '/'; - $html = getSimpleHTMLDOM($url); - $html = defaultLinkTo($html, $url); + $pathComponents = explode('/', $this->getInput('path')); + if (empty($pathComponents)) { + returnClientError('Path may not be empty'); + } + if (count($pathComponents) < 2) { + $showID = $pathComponents[0]; + } else { + $lastKey = count($pathComponents) - 1; + $showID = $pathComponents[$lastKey]; + if (strlen($showID) === 0) { + $showID = $pathComponents[$lastKey - 1]; + } + } - foreach($html->find('a.Root-sc-1ytw7qu-0') as $video) { + $url = SELF::APIENDPOINT . $showID . '/?pageSize=' . 
SELF::PAGESIZE; + $rawJSON = getContents($url); + $processedJSON = json_decode($rawJSON); + + foreach($processedJSON->teasers as $video) { $item = array(); - $item['uri'] = $video->href; - $item['title'] = $video->find('h3', 0)->plaintext; - $item['content'] = ''; - $item['timestamp'] = strtotime(mb_substr($video->find('div.Line-epbftj-1', 0)->plaintext, 0, 10)); - + // there is also ->links->self->id, ->links->self->urlId, ->links->target->id, ->links->target->urlId + $item['uri'] = SELF::VIDEOLINKPREFIX . $video->id . '/'; + // there is also ->mediumTitle and ->shortTitle + $item['title'] = $video->longTitle; + // in the test, aspect16x9 was the only child of images, not sure whether that is always true + $item['enclosures'] = array( + str_replace(SELF::IMAGEWIDTHPLACEHOLDER, SELF::IMAGEWIDTH, $video->images->aspect16x9->src) + ); + $item['content'] = '

'; + $item['timestamp'] = $video->broadcastedOn; + $item['uid'] = $video->id; + $item['author'] = $video->publicationService->name; $this->items[] = $item; } }