[ArsTechnicaBridge] Properly handle paged content (#3855)

* [ArsTechnicaBridge] Properly handle paged content

* [ArsTechnicaBridge] Remove normal site ad wrapper
This commit is contained in:
July 2023-12-23 03:42:37 -05:00 committed by GitHub
parent f40f997405
commit ea2b4d7506
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 13 additions and 18 deletions

View File

@@ -35,39 +35,34 @@ class ArsTechnicaBridge extends FeedExpander
protected function parseItem(array $item)
{
$item_html = getSimpleHTMLDOMCached($item['uri'] . '&amp');
$item_html = getSimpleHTMLDOMCached($item['uri']);
$item_html = defaultLinkTo($item_html, self::URI);
$item['content'] = $item_html->find('.article-content', 0);
$item_content = $item_html->find('.article-content.post-page', 0);
if (!$item_content) {
// The dom selector probably broke. Let's just return the item as-is
return $item;
$pages = $item_html->find('nav.page-numbers > .numbers > a', -2);
if (null !== $pages) {
for ($i = 2; $i <= $pages->innertext; $i++) {
$page_url = $item['uri'] . '&page=' . $i;
$page_html = getSimpleHTMLDOMCached($page_url);
$page_html = defaultLinkTo($page_html, self::URI);
$item['content'] .= $page_html->find('.article-content', 0);
}
$item['content'] = str_get_html($item['content']);
}
$item['content'] = $item_content;
// remove various ars advertising
$item['content']->find('#social-left', 0)->remove();
foreach ($item['content']->find('.ars-component-buy-box') as $ad) {
$ad->remove();
}
foreach ($item['content']->find('i-amphtml-sizer') as $ad) {
foreach ($item['content']->find('.ad_wrapper') as $ad) {
$ad->remove();
}
foreach ($item['content']->find('.sidebar') as $ad) {
$ad->remove();
}
foreach ($item['content']->find('a') as $link) { //remove amp redirect links
$url = $link->getAttribute('href');
if (str_contains($url, 'go.redirectingat.com')) {
$url = extractFromDelimiters($url, 'url=', '&amp');
$url = urldecode($url);
$link->setAttribute('href', $url);
}
}
$item['content'] = backgroundToImg(str_replace('data-amp-original-style="background-image', 'style="background-image', $item['content']));
$item['content'] = backgroundToImg($item['content']);
$item['uid'] = explode('=', $item['uri'])[1];