[CNet] Rewrite bridge (#3764) (#3770)

Bridge was broken.
Full bridge rewrite using Sitemap as source.
This commit is contained in:
ORelio 2023-10-18 19:13:33 +02:00 committed by GitHub
parent 7533ef12e3
commit 9056106c2d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 87 additions and 87 deletions

View File

@ -1,6 +1,6 @@
<?php
class CNETBridge extends BridgeAbstract
class CNETBridge extends SitemapBridge
{
const MAINTAINER = 'ORelio';
const NAME = 'CNET News';
@ -14,101 +14,101 @@ class CNETBridge extends BridgeAbstract
'type' => 'list',
'values' => [
'All articles' => '',
'Apple' => 'apple',
'Google' => 'google',
'Microsoft' => 'tags-microsoft',
'Computers' => 'topics-computers',
'Mobile' => 'topics-mobile',
'Sci-Tech' => 'topics-sci-tech',
'Security' => 'topics-security',
'Internet' => 'topics-internet',
'Tech Industry' => 'topics-tech-industry'
'Tech' => 'tech',
'Money' => 'personal-finance',
'Home' => 'home',
'Wellness' => 'health',
'Energy' => 'home/energy-and-utilities',
'Deals' => 'deals',
'Computing' => 'tech/computing',
'Mobile' => 'tech/mobile',
'Science' => 'science',
'Services' => 'tech/services-and-software'
]
]
],
'limit' => self::LIMIT
]
];
private function cleanArticle($article_html)
{
$offset_p = strpos($article_html, '<p>');
$offset_figure = strpos($article_html, '<figure');
$offset = ($offset_figure < $offset_p ? $offset_figure : $offset_p);
$article_html = substr($article_html, $offset);
$article_html = str_replace('href="/', 'href="' . self::URI, $article_html);
$article_html = str_replace(' height="0"', '', $article_html);
$article_html = str_replace('<noscript>', '', $article_html);
$article_html = str_replace('</noscript>', '', $article_html);
$article_html = StripWithDelimiters($article_html, '<a class="clickToEnlarge', '</a>');
$article_html = stripWithDelimiters($article_html, '<span class="nowPlaying', '</span>');
$article_html = stripWithDelimiters($article_html, '<span class="duration', '</span>');
$article_html = stripWithDelimiters($article_html, '<script', '</script>');
$article_html = stripWithDelimiters($article_html, '<svg', '</svg>');
return $article_html;
}
public function collectData()
{
// Retrieve and check user input
$topic = str_replace('-', '/', $this->getInput('topic'));
if (!empty($topic) && (substr_count($topic, '/') > 1 || !ctype_alpha(str_replace('/', '', $topic)))) {
returnClientError('Invalid topic: ' . $topic);
$topic = $this->getInput('topic');
$limit = $this->getInput('limit');
$limit = empty($limit) ? 10 : $limit;
$url_pattern = empty($topic) ? '' : self::URI . $topic;
$sitemap_latest = self::URI . 'sitemaps/article/' . date('Y/m') . '.xml';
$sitemap_previous = self::URI . 'sitemaps/article/' . date('Y/m', strtotime('last day of previous month')) . '.xml';
$links = array_merge(
$this->sitemapXmlToList($this->getSitemapXml($sitemap_latest, true), $url_pattern, $limit),
$this->sitemapXmlToList($this->getSitemapXml($sitemap_previous, true), $url_pattern, $limit)
);
if ($limit > 0 && count($links) > $limit) {
$links = array_slice($links, 0, $limit);
}
// Retrieve webpage
$pageUrl = self::URI . (empty($topic) ? 'news/' : $topic . '/');
$html = getSimpleHTMLDOM($pageUrl);
if (empty($links)) {
returnClientError('Failed to retrieve article list');
}
// Process articles
foreach ($html->find('div.assetBody, div.riverPost') as $element) {
if (count($this->items) >= 10) {
break;
}
foreach ($links as $article_uri) {
$article_dom = convertLazyLoading(getSimpleHTMLDOMCached($article_uri));
$title = trim($article_dom->find('h1', 0)->plaintext);
$author = $article_dom->find('span.c-assetAuthor_name', 0)->plaintext;
$headline = $article_dom->find('p.c-contentHeader_description', 0);
$content = $article_dom->find('div.c-pageArticle_content, div.single-article__content, div.article-main-body', 0);
$date = null;
$enclosure = null;
$article_title = trim($element->find('h2, h3', 0)->plaintext);
$article_uri = self::URI . substr($element->find('a', 0)->href, 1);
$article_thumbnail = $element->parent()->find('img[src]', 0)->src;
$article_timestamp = strtotime($element->find('time.assetTime, div.timeAgo', 0)->plaintext);
$article_author = trim($element->find('a[rel=author], a.name', 0)->plaintext);
$article_content = '<p><b>' . trim($element->find('p.dek', 0)->plaintext) . '</b></p>';
if (is_null($article_thumbnail)) {
$article_thumbnail = extractFromDelimiters($element->innertext, '<img src="', '"');
}
if (!empty($article_title) && !empty($article_uri) && strpos($article_uri, self::URI . 'news/') !== false) {
$article_html = getSimpleHTMLDOMCached($article_uri) or $article_html = null;
if (!is_null($article_html)) {
if (empty($article_thumbnail)) {
$article_thumbnail = $article_html->find('div.originalImage', 0);
}
if (empty($article_thumbnail)) {
$article_thumbnail = $article_html->find('span.imageContainer', 0);
}
if (is_object($article_thumbnail)) {
$article_thumbnail = $article_thumbnail->find('img', 0)->src;
}
$article_content .= trim(
$this->cleanArticle(
extractFromDelimiters(
$article_html,
'<article',
'<footer'
)
)
);
foreach ($article_dom->find('script[type=application/ld+json]') as $ldjson) {
$datePublished = extractFromDelimiters($ldjson->innertext, '"datePublished":"', '"');
if ($datePublished !== false) {
$date = strtotime($datePublished);
}
$imageObject = extractFromDelimiters($ldjson->innertext, 'ImageObject","url":"', '"');
if ($imageObject !== false) {
$enclosure = $imageObject;
}
$item = [];
$item['uri'] = $article_uri;
$item['title'] = $article_title;
$item['author'] = $article_author;
$item['timestamp'] = $article_timestamp;
$item['enclosures'] = [$article_thumbnail];
$item['content'] = $article_content;
$this->items[] = $item;
}
foreach ($content->find('div.c-shortcodeGallery') as $cleanup) {
$cleanup->outertext = '';
}
foreach ($content->find('figure') as $figure) {
$img = $figure->find('img', 0);
if ($img) {
$figure->outertext = $img->outertext;
}
}
$content = $content->innertext;
if ($enclosure) {
$content = "<div><img src=\"$enclosure\" /></div>" . $content;
}
if ($headline) {
$content = '<p><b>' . $headline->plaintext . '</b></p><br />' . $content;
}
$item = [];
$item['uri'] = $article_uri;
$item['title'] = $title;
$item['author'] = $author;
$item['content'] = $content;
if (!is_null($date)) {
$item['timestamp'] = $date;
}
if (!is_null($enclosure)) {
$item['enclosures'] = [$enclosure];
}
$this->items[] = $item;
}
}
}

View File

@ -131,7 +131,7 @@ class SitemapBridge extends CssSelectorBridge
foreach ($sitemap->find('sitemap') as $nested_sitemap) {
$url = $nested_sitemap->find('loc');
if (!empty($url)) {
$url = $url[0]->plaintext;
$url = trim($url[0]->plaintext);
if (str_ends_with(strtolower($url), '.xml')) {
$nested_sitemap_xml = $this->getSitemapXml($url, true);
$nested_sitemap_links = $this->sitemapXmlToList($nested_sitemap_xml, $url_pattern, null, true);
@ -148,8 +148,8 @@ class SitemapBridge extends CssSelectorBridge
$url = $item->find('loc');
$lastmod = $item->find('lastmod');
if (!empty($url) && !empty($lastmod)) {
$url = $url[0]->plaintext;
$lastmod = $lastmod[0]->plaintext;
$url = trim($url[0]->plaintext);
$lastmod = trim($lastmod[0]->plaintext);
$timestamp = strtotime($lastmod);
if (empty($url_pattern) || preg_match('/' . $url_pattern . '/', $url) === 1) {
$links[$url] = $timestamp;