[WeLiveSecurity] Fix content extraction (#3734)

This commit is contained in:
ORelio 2023-10-10 19:34:16 +02:00 committed by GitHub
parent 47f52b5912
commit 143f90da60
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 26 additions and 9 deletions

View File

@@ -16,19 +16,36 @@ class WeLiveSecurityBridge extends FeedExpander
{ {
$item = parent::parseItem($item); $item = parent::parseItem($item);
$article_html = getSimpleHTMLDOMCached($item['uri']); $html = getSimpleHTMLDOMCached($item['uri']);
if (!$article_html) { if (!$html) {
$item['content'] .= '<p><em>Could not request ' . $this->getName() . ': ' . $item['uri'] . '</em></p>'; $item['content'] .= '<br /><p><em>Could not request ' . $this->getName() . ': ' . $item['uri'] . '</em></p>';
return $item; return $item;
} }
$article_content = $article_html->find('div.formatted', 0)->innertext; $html = $html->find('.article-page', 0);
$article_content = stripWithDelimiters($article_content, '<script', '</script>'); $content_html = $html->find('.article-body', 0);
$article_content = stripRecursiveHTMLSection($article_content, 'div', '<div class="comments');
$article_content = stripRecursiveHTMLSection($article_content, 'div', '<div class="similar-articles');
$article_content = stripRecursiveHTMLSection($article_content, 'span', '<span class="meta');
$item['content'] = trim($article_content);
// Remove social media footer
foreach ($content_html->find('blockquote') as $blockquote) {
if (str_starts_with(trim($blockquote->plaintext), 'Connect with us on')) {
$blockquote->outertext = '';
}
}
// Headline subtitle
$content = $content_html->innertext;
$subtitle = $html->find('.sub-title', 0);
if ($subtitle) {
$content = '<p><b>' . $subtitle->plaintext . '</b></p>' . $content;
}
// Author
$author = $html->find('.article-author', 0);
if ($author && !isset($item['author'])) {
$item['author'] = trim($author->plaintext);
}
$item['content'] = trim($content);
return $item; return $item;
} }