From 212c56fde580887d84f0030a0910f4d884ffd1fe Mon Sep 17 00:00:00 2001 From: Paul Prechtel Date: Thu, 20 Apr 2023 23:02:08 +0200 Subject: [PATCH] [HeiseBridge] Handle heise+ articles better (#3358) - Stop parsing paywalled heise+ articles, as they yielded garbage content and not the full article anyway. - Link to archive.today to access the full article without an account. (Automatically getting the full article from archive.ph was not feasible because of captchas and problems extracting the actual content.) --- bridges/HeiseBridge.php | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/bridges/HeiseBridge.php b/bridges/HeiseBridge.php index d89710f9..0e57ee85 100644 --- a/bridges/HeiseBridge.php +++ b/bridges/HeiseBridge.php @@ -118,12 +118,22 @@ class HeiseBridge extends FeedExpander protected function parseItem($feedItem) { $item = parent::parseItem($feedItem); - $item['uri'] = explode('?', $item['uri'])[0] . '?seite=all'; + // strip rss parameter + $item['uri'] = explode('?', $item['uri'])[0]; + + // ignore TechStage articles if (strpos($item['uri'], 'https://www.heise.de') !== 0) { return $item; } + // abort on heise+ articles and link to archive.ph for full-text content + if (str_starts_with($item['title'], 'heise+ |')) { + $item['uri'] = 'https://archive.ph/?run=1&url=' . urlencode($item['uri']); + return $item; + } + + $item['uri'] .= '?seite=all'; $article = getSimpleHTMLDOMCached($item['uri']); if ($article) {