[HeiseBridge] Handle heise+ articles better (#3358)

- Stop parsing paywalled heise+ articles, as they yielded garbage content
  and never contained the full article anyway.
- Link to archive.today to access the full article without an account.
  (Automatically fetching the full article from archive.ph was not feasible
  because of captchas and problems extracting the actual content.)
This commit is contained in:
Paul Prechtel 2023-04-20 23:02:08 +02:00 committed by GitHub
parent 00e716d84d
commit 212c56fde5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 11 additions and 1 deletion

View File

@@ -118,12 +118,22 @@ class HeiseBridge extends FeedExpander
protected function parseItem($feedItem)
{
$item = parent::parseItem($feedItem);
$item['uri'] = explode('?', $item['uri'])[0] . '?seite=all';
// strip rss parameter
$item['uri'] = explode('?', $item['uri'])[0];
// ignore TechStage articles
if (strpos($item['uri'], 'https://www.heise.de') !== 0) {
return $item;
}
// abort on heise+ articles and link to archive.ph for full-text content
if (str_starts_with($item['title'], 'heise+ |')) {
$item['uri'] = 'https://archive.ph/?run=1&url=' . urlencode($item['uri']);
return $item;
}
$item['uri'] .= '?seite=all';
$article = getSimpleHTMLDOMCached($item['uri']);
if ($article) {