[EconomistBridge] Fix for new layout (#2489)

This commit is contained in:
Bockiii 2022-03-22 20:24:07 +01:00 committed by GitHub
parent 1a3419a2d4
commit ac8e94ec56
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 19 additions and 6 deletions

View File

@ -95,23 +95,36 @@ class EconomistBridge extends FeedExpander {
/**
 * Expand a feed entry into a full item by fetching and scraping the article page.
 *
 * The page behind the item's 'uri' is loaded, its layout (old or new style) is
 * detected, and the matching selectors are used to fill in the item's content,
 * lead-image enclosure and subheadline category.
 *
 * @param mixed $feedItem Raw feed entry, passed through unchanged to parent::parseItem().
 * @return array The item from parent::parseItem() with 'content', 'enclosures'
 *               and (when a subheadline is found) 'categories' populated.
 */
protected function parseItem($feedItem){
    $item = parent::parseItem($feedItem);
    $article = getSimpleHTMLDOM($item['uri']);

    // Old style articles carry data-test-id="Article" on their <article> element.
    // find(..., 0) yields null when nothing matches, so guard before reading the
    // attribute; a page without a matching element is treated as new style.
    $articleNode = $article->find('article', 0);
    if (!is_null($articleNode) && $articleNode->getAttribute('data-test-id') === 'Article') {
        $contentNode = 'div.layout-article-body';
        $imgNode = 'div.article__lead-image';
        $categoryNode = 'span.article__subheadline';
    } else {
        $contentNode = 'div.e4sqmp1';
        $imgNode = 'figure.e3y6nua0';
        $categoryNode = 'span.ern1uyf0';
    }

    // Before the article can be added it needs to be cleaned up, hence the extra function.
    $item['content'] = $this->cleanContent($article, $contentNode);

    // Only the article lead image is retained, if it is there. The wrapper element
    // may exist without an <img> inside, so both lookups are checked.
    $leadImage = $article->find($imgNode, 0);
    $image = is_null($leadImage) ? null : $leadImage->find('img', 0);
    if (!is_null($image)) {
        $item['enclosures'][] = $image->getAttribute('src');
    } else {
        // Kept from the original behaviour: an empty entry is recorded when no image is found.
        $item['enclosures'][] = '';
    }

    // Add the subheadline as category. This will create a link in new articles
    // and a text in old articles. Skipped when the page has no subheadline.
    $subheadline = $article->find($categoryNode, 0);
    if (!is_null($subheadline)) {
        $item['categories'][] = $subheadline->innertext;
    }

    return $item;
}
private function cleanContent($article){
private function cleanContent($article, $contentNode){
// the actual article is in this div
$content = $article->find('div.layout-article-body', 0)->innertext;
$content = $article->find($contentNode, 0)->innertext;
// clean the article content. Remove all div's since the text is in paragraph elements
foreach (array(
'<div '