[NationalGeographicBridge] Rewrite bridge (#2177)

- All existing options are preserved.
- Add the timestamp and the authors' names when the full article is requested.
This commit is contained in:
csisoap 2021-10-01 20:39:36 +07:00 committed by GitHub
parent cb111a3ebd
commit 8bcf4ebfbf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 242 additions and 74 deletions

View File

@ -6,11 +6,12 @@ class NationalGeographicBridge extends BridgeAbstract {
// Name of the checkbox parameter that requests full article content
const PARAMETER_FULL_ARTICLE = 'full';
// Topic labels; collectData() switches on the selected topic name
const TOPIC_MAGAZINE = 'Magazine';
const TOPIC_LATEST_STORIES = 'Latest Stories';
const CACHE_TIMEOUT = 900; //15 min
const NAME = 'National Geographic';
const URI = 'https://www.nationalgeographic.com/';
const DESCRIPTION = 'Fetches the latest articles from the National Geographic Magazine';
// Declared exactly once: a class constant cannot be redefined
const MAINTAINER = 'csisoap';
const PARAMETERS = array(
self::CONTEXT_BY_TOPIC => array(
self::PARAMETER_TOPIC => array(
@ -28,12 +29,22 @@ class NationalGeographicBridge extends BridgeAbstract {
self::PARAMETER_FULL_ARTICLE => array(
'name' => 'Full Article',
'type' => 'checkbox',
'title' => 'Enable to load full articles (takes longer)'
'title' => 'Enable to load full articles and other infos (takes longer)'
)
)
);
// Name of the topic currently being collected; assigned in collectData()
private $topicName = '';
// Base64-encoded JSON sent as the `context` query parameter by getAPIURL().
// It decodes to:
//   {"contentType":"UnisonHub","variables":{"locator":"/pages/topic/latest-stories",
//    "portfolio":"natgeo","queryType":"LOCATOR"},"moduleId":null}
// The literal is wrapped over several lines, so it contains line breaks;
// getAPIURL() strips all whitespace before using it.
// NOTE(review): the same latest-stories locator is sent for the magazine
// module ids as well - presumably the endpoint selects by `id`; confirm.
const CONTEXT = 'eyJjb250ZW50VHlwZSI6IlVuaXNvbkh1YiIsInZhcmlhYmxlcyI6eyJsb2NhdG9yIjoiL3BhZ2VzL3
RvcGljL2xhdGVzdC1zdG9yaWVzIiwicG9ydGZvbGlvIjoibmF0Z2VvIiwicXVlcn
lUeXBlIjoiTE9DQVRPUiJ9LCJtb2R1bGVJZCI6bnVsbH0';
// Module ids requested (one request each) by collectLatestStories()
const LATEST_STORIES_ID = array(
'1df278bb-0e3d-4a67-a0ce-8fae48392822-f2-m1'
);
// Module ids requested (one request each) by collectMagazine()
const MAGAZINE_ID = array(
'94d87d74-f41a-4a32-9acd-b591ba2df288-f2-m1',
'94d87d74-f41a-4a32-9acd-b591ba2df288-f5-m2',
);
public function getURI() {
switch ($this->queriedContext) {
@ -46,9 +57,16 @@ class NationalGeographicBridge extends BridgeAbstract {
}
}
/**
 * Builds the hub proxy URL that returns the feed module with the given id.
 *
 * @param string $id Module id (see LATEST_STORIES_ID / MAGAZINE_ID)
 * @return string Request URL
 */
private function getAPIURL($id) {
    // CONTEXT is wrapped over several source lines; remove that whitespace
    $encodedContext = preg_replace('/\s*/m', '', self::CONTEXT);

    return implode('', array(
        'https://www.nationalgeographic.com/proxy/hub?context=',
        $encodedContext,
        '&id=',
        $id,
        '&moduleType=InfiniteFeedModule&_xhr=pageContent',
    ));
}
public function collectData() {
$this->topicName = $this->getTopicName($this->getInput(self::PARAMETER_TOPIC));
switch($this->topicName) {
case self::TOPIC_MAGAZINE: {
return $this->collectMagazine();
@ -78,28 +96,35 @@ class NationalGeographicBridge extends BridgeAbstract {
}
/**
 * Collects the magazine stories.
 *
 * Requests every module listed in MAGAZINE_ID through the URL built by
 * getAPIURL() and passes each entry of the response's 'tiles' list to
 * addStory().
 */
private function collectMagazine() {
    $stories = array();

    foreach(self::MAGAZINE_ID as $id) {
        $uri = $this->getAPIURL($id);

        $json_raw = getContents($uri)
            or returnServerError('Could not request ' . $uri);

        $json = json_decode($json_raw, true);

        // Fail with a clear message instead of merging a non-array
        if(!isset($json['tiles']) || !is_array($json['tiles'])) {
            returnServerError('Could not find any stories in ' . $uri);
        }

        // Tiles of later modules are placed in front of earlier ones
        $stories = array_merge($json['tiles'], $stories);
    }

    foreach($stories as $story) {
        $this->addStory($story);
    }
}
/**
 * Collects the latest stories.
 *
 * Requests every module listed in LATEST_STORIES_ID through the URL built
 * by getAPIURL() and passes each entry of the response's 'tiles' list to
 * addStory().
 */
private function collectLatestStories() {
    $stories = array();

    foreach(self::LATEST_STORIES_ID as $id) {
        $uri = $this->getAPIURL($id);

        $json_raw = getContents($uri)
            or returnServerError('Could not request ' . $uri);

        $json = json_decode($json_raw, true);

        // Fail with a clear message instead of merging a non-array
        if(!isset($json['tiles']) || !is_array($json['tiles'])) {
            returnServerError('Could not find any stories in ' . $uri);
        }

        // Tiles are appended in module order
        $stories = array_merge($stories, $json['tiles']);
    }

    foreach($stories as $story) {
        $this->addStory($story);
    }
}
@ -107,88 +132,231 @@ class NationalGeographicBridge extends BridgeAbstract {
/**
 * Converts one story tile into a feed item and appends it to $this->items.
 *
 * Keys read from the tile: 'ctas' (link and icon), 'title', 'description',
 * 'img' and 'tags'. When the full-article option is enabled, the article
 * page is loaded through getFullArticle() to add content, timestamp and
 * authors.
 *
 * @param array $story Decoded tile from the hub response
 */
private function addStory($story) {
    $content = '';
    $story_type = '';
    $uri = '';

    // Every call-to-action overwrites the previous one, so the last one wins
    if(isset($story['ctas']) && is_array($story['ctas'])) {
        foreach($story['ctas'] as $component) {
            $uri = $component['url'];
            $story_type = $component['icon'];
        }
    }

    $item = array();
    $item['uri'] = $uri;
    $item['title'] = isset($story['title']) ? $story['title'] : 'Unknown title';

    if(isset($story['description'])) {
        $content = '<p>' . $story['description'] . '</p>';
    }

    $item['content'] = $content;

    /* Nat Geo doesn't provide much info about interactive pages
     * and they require JS to load, so only the description is used there.
     */
    if($this->getInput(self::PARAMETER_FULL_ARTICLE) && $story_type !== 'interactive') {
        $article_data = $this->getFullArticle($item['uri']);
        $item['timestamp'] = $article_data['published_date'];
        $item['author'] = $article_data['authors'];
        $item['content'] = $content . $article_data['content'];
    }

    if(isset($story['img']['src'])) {
        $item['enclosures'][] = $story['img']['src'];
    }

    if(isset($story['tags']) && is_array($story['tags'])) {
        foreach($story['tags'] as $tag) {
            $item['categories'][] = $tag['name'];
        }
    }

    $this->items[] = $item;
}
/**
 * Finds the article module inside the page's frame list.
 *
 * Looks at every frame whose 'id' is 'natgeo-template1-frame-1' and
 * returns the first entry of its 'mods' list whose 'id' is
 * 'natgeo-template1-frame-1-module-1'.
 *
 * @param array $data List of frames
 * @return array|null The matching module, or null when there is none
 */
private function filterArticleData($data) {
    foreach($data as $frame) {
        if(!is_array($frame)
        || !isset($frame['id'])
        || $frame['id'] != 'natgeo-template1-frame-1'
        || !isset($frame['mods'])
        || !is_array($frame['mods'])) {
            continue;
        }

        foreach($frame['mods'] as $module) {
            if(isset($module['id']) && $module['id'] == 'natgeo-template1-frame-1-module-1') {
                return $module;
            }
        }
    }

    return null;
}
/**
 * Renders one media module as a <figure> with image and caption.
 *
 * The keys read depend on $image_type:
 * - 'image' / 'imagegroup': 'image' (src, altText, crdt), 'alt', 'caption'
 * - 'photogallery': 'img' (src, altText), 'caption' (text, credit)
 * - 'video': 'image' (src, altText), 'credit', 'description'
 * Missing keys yield empty strings; any other type yields an empty figure.
 *
 * @param array $image_module Media module data
 * @param string $image_type One of the types listed above
 * @return string HTML markup
 */
private function handleImages($image_module, $image_type) {
    $image = array();
    $image_alt = '';
    $image_credit = '';
    $caption = '';

    switch($image_type) {
        case 'image':
        case 'imagegroup':
            $image = isset($image_module['image']) ? $image_module['image'] : array();
            // The module-level alt text takes precedence over the image's own
            if(isset($image_module['alt'])) {
                $image_alt = $image_module['alt'];
            } elseif(isset($image['altText'])) {
                $image_alt = $image['altText'];
            }
            $image_credit = isset($image['crdt']) ? $image['crdt'] : '';
            $caption = isset($image_module['caption']) ? $image_module['caption'] : '';
            break;
        case 'photogallery':
            $image = isset($image_module['img']) ? $image_module['img'] : array();
            $image_credit = isset($image_module['caption']['credit']) ? $image_module['caption']['credit'] : '';
            $caption = isset($image_module['caption']['text']) ? $image_module['caption']['text'] : '';
            $image_alt = isset($image['altText']) ? $image['altText'] : '';
            break;
        case 'video':
            $image = isset($image_module['image']) ? $image_module['image'] : array();
            $image_credit = isset($image_module['credit']) ? $image_module['credit'] : '';
            $description = isset($image_module['description']) ? $image_module['description'] : '';
            $caption = $description . ' Video can be watched on the article\'s page';
            $image_alt = isset($image['altText']) ? $image['altText'] : '';
            break;
    }

    $image_src = isset($image['src']) ? $image['src'] : '';

    // Attribute values are escaped so quotes cannot break the markup;
    // entities that are already encoded are left alone (double_encode off).
    $src_attr = htmlspecialchars((string)$image_src, ENT_QUOTES, 'UTF-8', false);
    $alt_attr = htmlspecialchars((string)$image_alt, ENT_QUOTES, 'UTF-8', false);

    // The caption is inserted as-is (not escaped)
    $image_caption = $caption . ' ' . $image_credit
        . '. Notes: Some image may have copyrighted on it.';

    return '<figure>' . "\n"
        . '<img src="' . $src_attr . '" alt="' . $alt_attr . '">' . "\n"
        . '<figcaption>' . $image_caption . '</figcaption>' . "\n"
        . '</figure>';
}
/**
 * Loads an article page and extracts its content, publication date and
 * author names from the JSON assigned to window['__natgeo__'] in the page.
 *
 * @param string $uri Article URL
 * @return array Keys: 'content' (HTML string), 'published_date' (the
 * article's 'pbDt' value) and 'authors' (comma-separated display names)
 */
private function getFullArticle($uri) {
    $html = getContents($uri)
        or returnServerError('Could not load ' . $uri);

    $scriptRegex = '/window\[\'__natgeo__\'\]=(.*);<\/script>/';

    // NOTE(review): the checks below assume returnServerError() aborts
    // (the file already uses it as `... or returnServerError()`); confirm.
    if(preg_match($scriptRegex, $html, $matches) !== 1) {
        returnServerError('Could not find article data in ' . $uri);
    }

    $json = json_decode($matches[1], true);
    if(!isset($json['page']['content']['article']['frms'])) {
        returnServerError('Could not parse article data of ' . $uri);
    }

    $filtered_data = $this->filterArticleData($json['page']['content']['article']['frms']);
    if(!isset($filtered_data['edgs'][0])) {
        returnServerError('Could not find article content in ' . $uri);
    }

    $article = $filtered_data['edgs'][0];

    // Only the first contributor group is used for the author list
    $authors = array();
    if(isset($article['cntrbGrp'][0]['contributors']) && is_array($article['cntrbGrp'][0]['contributors'])) {
        $authors = $article['cntrbGrp'][0]['contributors'];
    }
    $authors_name = implode(', ', array_column($authors, 'displayName'));

    $published_date = $article['pbDt'];
    $article_body = $article['bdy'];

    $content = '';

    foreach($article_body as $body) {
        switch($body['type']) {
            case 'p':
                $content .= '<p>' . $body['cntnt']['mrkup'] . '</p>';
                break;
            case 'h2':
                $content .= '<h2>' . $body['cntnt']['mrkup'] . '</h2>';
                break;
            case 'inline':
                $content .= $this->renderInlineModule($body['cntnt']);
                break;
            case 'ul':
                $content .= $body['cntnt']['mrkup'] . '<hr>';
                break;
        }
    }

    return array(
        'content' => $content,
        'published_date' => $published_date,
        'authors' => $authors_name
    );
}

/**
 * Renders one inline module of the article body, chosen by its 'cmsType'.
 *
 * @param array $module The 'cntnt' value of an inline body element
 * @return string HTML markup; empty for empty or unsupported modules
 */
private function renderInlineModule($module) {
    if(empty($module) || !isset($module['cmsType'])) {
        return '';
    }

    $type = $module['cmsType'];
    $markup = '';

    switch($type) {
        case 'image':
        case 'video':
            $markup = $this->handleImages($module, $type);
            break;
        case 'imagegroup':
            foreach($module['images'] as $image) {
                $markup .= $this->handleImages($image, $type);
            }
            break;
        case 'photogallery':
            foreach($module['media'] as $image) {
                $markup .= $this->handleImages($image, $type);
            }
            break;
        case 'editorsNote':
            $markup = $module['note'];
            break;
        case 'listicle':
            $markup = '<h2>' . $module['title'] . '</h2>';
            if(isset($module['image'])) {
                $markup .= $this->handleImages($module['image'], $module['image']['cmsType']);
            }
            $markup .= '<p>' . (isset($module['text']) ? $module['text'] : '') . '</p>';
            break;
        case 'pullquote':
            $bylines = array();
            $quote_authors = isset($module['byLineProps']['authors']) ? $module['byLineProps']['authors'] : array();
            foreach($quote_authors as $author) {
                $byline = $author['displayName'];
                // The description is optional; no dangling comma without it
                if(isset($author['authorDesc']) && $author['authorDesc'] !== '') {
                    $byline .= ', ' . $author['authorDesc'];
                }
                $bylines[] = $byline;
            }
            $markup = '<figure>' . "\n"
                . '<blockquote>' . "\n"
                . '<p>' . $module['quote'] . '</p>' . "\n"
                . '</blockquote>' . "\n"
                . '<figcaption>' . implode('; ', $bylines) . '</figcaption>' . "\n"
                . '</figure>';
            break;
    }

    return $markup;
}
}