[ARDMediathekBridge] Switch to JSON-API (#2380)

* Switch ARDMediathekBridge to JSON-API

The html screen scraping approach of ARDMediathekBridge did not work reliably. I could not find one show for which the item list was not empty using the html screen scraping approach.

The proposed change uses the JSON-API of the WebApp. Although there is still room for improvement (feed title, better understanding of the API, more accurate mimic of the webapp's behavior, de-pagination …), it does work with this change.

Indicate that now full URLs as well as just the ID are accepted.
This commit is contained in:
MarKoeh 2022-01-10 11:47:49 +01:00 committed by GitHub
parent 368a198321
commit f259fa7f9f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 67 additions and 12 deletions

View File

@ -4,14 +4,48 @@ class ARDMediathekBridge extends BridgeAbstract {
const URI = 'https://www.ardmediathek.de';
const DESCRIPTION = 'Feed of any series in the ARD-Mediathek, specified by its path';
const MAINTAINER = 'yue-dongchen';
/*
* Number of Items to be requested from ARDmediathek API
* 12 has been observed on the wild
* 29 is the highest successfully tested value
* More Items could be fetched via pagination
* The JSON-field pagination holds more information on that
* @const PAGESIZE number of requested items
*/
const PAGESIZE = 29;
/*
* The URL Prefix of the (Webapp-)API
* @const APIENDPOINT https-URL of the used endpoint
*/
const APIENDPOINT = 'https://api.ardmediathek.de/page-gateway/widgets/ard/asset/';
/*
* The URL prefix of the video link
* URLs from the webapp include a slug containing titles of show, episode, and tv station.
* It seems to work without that.
* @const VIDEOLINKPREFIX https-URL prefix of video links
*/
const VIDEOLINKPREFIX = 'https://www.ardmediathek.de/video/';
/*
* The requested width of the preview image
* 432 has been observed on the wild
* The webapp seems to also compute and add the height value
* It seems to works without that.
* @const IMAGEWIDTH width in px of the preview image
*/
const IMAGEWIDTH = 432;
/*
* Placeholder that will be replace by IMAGEWIDTH in the preview image URL
* @const IMAGEWIDTHPLACEHOLDER
*/
const IMAGEWIDTHPLACEHOLDER = '{width}';
const PARAMETERS = array(
array(
'path' => array(
'name' => 'Path',
'name' => 'Show Link or ID',
'required' => true,
'title' => 'Enter without trailing slash',
'defaultValue' => '45-min/Y3JpZDovL25kci5kZS8xMzkx'
'title' => 'Link to the show page or just its alphanumeric suffix',
'defaultValue' => 'https://www.ardmediathek.de/sendung/45-min/Y3JpZDovL25kci5kZS8xMzkx/'
)
)
);
@ -19,17 +53,38 @@ class ARDMediathekBridge extends BridgeAbstract {
public function collectData() {
date_default_timezone_set('Europe/Berlin');
$url = 'https://www.ardmediathek.de/sendung/' . $this->getInput('path') . '/';
$html = getSimpleHTMLDOM($url);
$html = defaultLinkTo($html, $url);
$pathComponents = explode('/', $this->getInput('path'));
if (empty($pathComponents)) {
returnClientError('Path may not be empty');
}
if (count($pathComponents) < 2) {
$showID = $pathComponents[0];
} else {
$lastKey = count($pathComponents) - 1;
$showID = $pathComponents[$lastKey];
if (strlen($showID) === 0) {
$showID = $pathComponents[$lastKey - 1];
}
}
foreach($html->find('a.Root-sc-1ytw7qu-0') as $video) {
$url = SELF::APIENDPOINT . $showID . '/?pageSize=' . SELF::PAGESIZE;
$rawJSON = getContents($url);
$processedJSON = json_decode($rawJSON);
foreach($processedJSON->teasers as $video) {
$item = array();
$item['uri'] = $video->href;
$item['title'] = $video->find('h3', 0)->plaintext;
$item['content'] = '<img src="' . $video->find('img', 0)->src . '" />';
$item['timestamp'] = strtotime(mb_substr($video->find('div.Line-epbftj-1', 0)->plaintext, 0, 10));
// there is also ->links->self->id, ->links->self->urlId, ->links->target->id, ->links->target->urlId
$item['uri'] = SELF::VIDEOLINKPREFIX . $video->id . '/';
// there is also ->mediumTitle and ->shortTitle
$item['title'] = $video->longTitle;
// in the test, aspect16x9 was the only child of images, not sure whether that is always true
$item['enclosures'] = array(
str_replace(SELF::IMAGEWIDTHPLACEHOLDER, SELF::IMAGEWIDTH, $video->images->aspect16x9->src)
);
$item['content'] = '<img src="' . $item['enclosures'][0] . '" /><p>';
$item['timestamp'] = $video->broadcastedOn;
$item['uid'] = $video->id;
$item['author'] = $video->publicationService->name;
$this->items[] = $item;
}
}