From 8d8fe66aab916a63cd04eb129a91122c14e8c623 Mon Sep 17 00:00:00 2001 From: Mynacol Date: Wed, 21 Sep 2022 21:31:43 +0200 Subject: [PATCH] [HeiseBridge] Parser rewrite (#3054) * [HeiseBridge] Parser rewrite This rewrite is more readable and consistent than the previous one. Additionally, this removes unwanted elements, largely recommendations for other articles. Furthermore, it increases the image quality by using the original picture link instead of the compressed ones. * [HeiseBridge] Formatting --- bridges/HeiseBridge.php | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/bridges/HeiseBridge.php b/bridges/HeiseBridge.php index 5f5092e3..6da6a3d1 100644 --- a/bridges/HeiseBridge.php +++ b/bridges/HeiseBridge.php @@ -61,24 +61,42 @@ class HeiseBridge extends FeedExpander private function addArticleToItem($item, $article) { - $authors = $article->find('.a-creator__names', 0)->find('.a-creator__name'); + // copy full-res img src to standard img element + foreach ($article->find('a-img') as $aimg) { + $img = $aimg->find('img', 0); + $img->src = $aimg->src; + // client scales based on aspect ratio in style attribute + $img->width = ''; + $img->height = ''; + } + // relink URIs, as the previous a-img tags weren't recognized by this function + $article = defaultLinkTo($article, $item['uri']); + + // remove unwanted stuff + foreach ($article->find('figure.branding, a-ad, div.ho-text, noscript img, .opt-in__content-container') as $element) { + $element->remove(); + } + // reload html, as remove() is buggy + $article = str_get_html($article->outertext); + + $header = $article->find('header.a-article-header', 0); + $headerElements = $header->find('p, a-img img, figure img'); + $item['content'] = implode('', $headerElements); + + $authors = $header->find('.a-creator__names .a-creator__name'); if ($authors) { $item['author'] = implode(', ', array_map(function ($e) { return $e->plaintext; }, $authors)); } - $content = $article->find('div[class*="article-content"]', 0); + $content = $article->find('.article-content', 0); + $contentElements = $content->find( + 'p, h3, ul, table, pre, a-img img, a-bilderstrecke h2, a-bilderstrecke figure, a-bilderstrecke figcaption' + ); + $item['content'] .= implode('', $contentElements); - if ($content == null) { - $content = $article->find('#article_content', 0); - } - - foreach ($content->find('p, h3, ul, table, pre, img') as $element) { - $item['content'] .= $element; - } - - foreach ($content->find('img') as $img) { + foreach ($article->find('a-img img, a-bilderstrecke img, figure img') as $img) { $item['enclosures'][] = $img->src; }