feat: preserve and reproduce podcast feeds (itunes rss module) (#3759)

This commit is contained in:
Dag 2023-10-16 02:58:03 +02:00 committed by GitHub
parent 408c2e5e91
commit ef5bd83bd0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 126 additions and 85 deletions

View File

@ -50,7 +50,9 @@ class CssSelectorFeedExpanderBridge extends CssSelectorBridge
$discard_thumbnail = $this->getInput('discard_thumbnail'); $discard_thumbnail = $this->getInput('discard_thumbnail');
$limit = $this->getInput('limit'); $limit = $this->getInput('limit');
$source_feed = (new FeedParser())->parseFeed(getContents($url)); $feedParser = new FeedParser();
$xml = getContents($url);
$source_feed = $feedParser->parseFeed($xml);
$items = $source_feed['items']; $items = $source_feed['items'];
// Map Homepage URL (Default: Root page) // Map Homepage URL (Default: Root page)

View File

@ -62,52 +62,27 @@ class NyaaTorrentsBridge extends BridgeAbstract
public function collectData() public function collectData()
{ {
// Manually parsing because we need to access the nyaa namespace in the xml $feedParser = new FeedParser();
$xml = simplexml_load_string(getContents($this->getURI())); $feed = $feedParser->parseFeed(getContents($this->getURI()));
$channel = $xml->channel[0];
$feed = [];
$feed['title'] = trim((string)$channel->title);
$feed['uri'] = trim((string)$channel->link);
if (!empty($channel->image)) {
$feed['icon'] = trim((string)$channel->image->url);
}
$items = $xml->channel[0]->item;
foreach ($items as $feedItem) {
$item = [
'title' => (string) $feedItem->title,
'uri' => (string) $feedItem->link,
];
foreach ($feed['items'] as $item) {
$item['id'] = str_replace(['https://nyaa.si/download/', '.torrent'], '', $item['uri']); $item['id'] = str_replace(['https://nyaa.si/download/', '.torrent'], '', $item['uri']);
$nyaaNamespace = (array)($feedItem->children('nyaa', true));
$item = array_merge($item, $nyaaNamespace);
// Convert URI from torrent file to web page
$item['uri'] = str_replace('/download/', '/view/', $item['uri']); $item['uri'] = str_replace('/download/', '/view/', $item['uri']);
$item['uri'] = str_replace('.torrent', '', $item['uri']); $item['uri'] = str_replace('.torrent', '', $item['uri']);
$dom = getSimpleHTMLDOMCached($item['uri']);
$item_html = getSimpleHTMLDOMCached($item['uri']); if ($dom) {
if ($item_html) { $description = $dom->find('#torrent-description', 0)->innertext ?? '';
// Retrieve full description from page contents $itemDom = str_get_html(markdownToHtml(html_entity_decode($description)));
$item_desc = str_get_html(
markdownToHtml(html_entity_decode($item_html->find('#torrent-description', 0)->innertext))
);
// Retrieve image for thumbnail or generic logo fallback
$item_image = $this->getURI() . 'static/img/avatar/default.png'; $item_image = $this->getURI() . 'static/img/avatar/default.png';
foreach ($item_desc->find('img') as $img) { foreach ($itemDom->find('img') as $img) {
if (strpos($img->src, 'prez') === false) { if (strpos($img->src, 'prez') === false) {
$item_image = $img->src; $item_image = $img->src;
break; break;
} }
} }
$item['enclosures'] = [$item_image]; $item['enclosures'] = [$item_image];
$item['content'] = $item_desc; $item['content'] = (string) $itemDom;
} }
$this->items[] = $item; $this->items[] = $item;
if (count($this->items) >= 10) { if (count($this->items) >= 10) {
break; break;

View File

@ -28,6 +28,7 @@
"ext-openssl": "*", "ext-openssl": "*",
"ext-libxml": "*", "ext-libxml": "*",
"ext-simplexml": "*", "ext-simplexml": "*",
"ext-dom": "*",
"ext-json": "*" "ext-json": "*"
}, },
"require-dev": { "require-dev": {
@ -38,8 +39,7 @@
"ext-memcached": "Allows to use memcached as cache type", "ext-memcached": "Allows to use memcached as cache type",
"ext-sqlite3": "Allows to use an SQLite database for caching", "ext-sqlite3": "Allows to use an SQLite database for caching",
"ext-zip": "Required for FDroidRepoBridge", "ext-zip": "Required for FDroidRepoBridge",
"ext-intl": "Required for OLXBridge", "ext-intl": "Required for OLXBridge"
"ext-dom": "Allows to use some bridges based on XPath expressions"
}, },
"autoload-dev": { "autoload-dev": {
"psr-4": { "psr-4": {

View File

@ -16,6 +16,8 @@ class AtomFormat extends FormatAbstract
public function stringify() public function stringify()
{ {
$document = new \DomDocument('1.0', $this->getCharset());
$feedUrl = get_current_url(); $feedUrl = get_current_url();
$extraInfos = $this->getExtraInfos(); $extraInfos = $this->getExtraInfos();
@ -25,7 +27,6 @@ class AtomFormat extends FormatAbstract
$uri = $extraInfos['uri']; $uri = $extraInfos['uri'];
} }
$document = new \DomDocument('1.0', $this->getCharset());
$document->formatOutput = true; $document->formatOutput = true;
$feed = $document->createElementNS(self::ATOM_NS, 'feed'); $feed = $document->createElementNS(self::ATOM_NS, 'feed');
$document->appendChild($feed); $document->appendChild($feed);
@ -81,6 +82,7 @@ class AtomFormat extends FormatAbstract
$linkSelf->setAttribute('href', $feedUrl); $linkSelf->setAttribute('href', $feedUrl);
foreach ($this->getItems() as $item) { foreach ($this->getItems() as $item) {
$itemArray = $item->toArray();
$entryTimestamp = $item->getTimestamp(); $entryTimestamp = $item->getTimestamp();
$entryTitle = $item->getTitle(); $entryTitle = $item->getTitle();
$entryContent = $item->getContent(); $entryContent = $item->getContent();
@ -138,7 +140,19 @@ class AtomFormat extends FormatAbstract
$entry->appendChild($id); $entry->appendChild($id);
$id->appendChild($document->createTextNode($entryID)); $id->appendChild($document->createTextNode($entryID));
if (!empty($entryUri)) { if (isset($itemArray['itunes'])) {
$feed->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:itunes', self::ITUNES_NS);
foreach ($itemArray['itunes'] as $itunesKey => $itunesValue) {
$itunesProperty = $document->createElementNS(self::ITUNES_NS, $itunesKey);
$entry->appendChild($itunesProperty);
$itunesProperty->appendChild($document->createTextNode($itunesValue));
}
$itunesEnclosure = $document->createElement('enclosure');
$entry->appendChild($itunesEnclosure);
$itunesEnclosure->setAttribute('url', $itemArray['enclosure']['url']);
$itunesEnclosure->setAttribute('length', $itemArray['enclosure']['length']);
$itunesEnclosure->setAttribute('type', $itemArray['enclosure']['type']);
} elseif (!empty($entryUri)) {
$entryLinkAlternate = $document->createElement('link'); $entryLinkAlternate = $document->createElement('link');
$entry->appendChild($entryLinkAlternate); $entry->appendChild($entryLinkAlternate);
$entryLinkAlternate->setAttribute('rel', 'alternate'); $entryLinkAlternate->setAttribute('rel', 'alternate');

View File

@ -34,6 +34,8 @@ class MrssFormat extends FormatAbstract
public function stringify() public function stringify()
{ {
$document = new \DomDocument('1.0', $this->getCharset());
$feedUrl = get_current_url(); $feedUrl = get_current_url();
$extraInfos = $this->getExtraInfos(); $extraInfos = $this->getExtraInfos();
if (empty($extraInfos['uri'])) { if (empty($extraInfos['uri'])) {
@ -42,7 +44,6 @@ class MrssFormat extends FormatAbstract
$uri = $extraInfos['uri']; $uri = $extraInfos['uri'];
} }
$document = new \DomDocument('1.0', $this->getCharset());
$document->formatOutput = true; $document->formatOutput = true;
$feed = $document->createElement('rss'); $feed = $document->createElement('rss');
$document->appendChild($feed); $document->appendChild($feed);
@ -99,22 +100,23 @@ class MrssFormat extends FormatAbstract
$linkSelf->setAttribute('href', $feedUrl); $linkSelf->setAttribute('href', $feedUrl);
foreach ($this->getItems() as $item) { foreach ($this->getItems() as $item) {
$itemArray = $item->toArray();
$itemTimestamp = $item->getTimestamp(); $itemTimestamp = $item->getTimestamp();
$itemTitle = $item->getTitle(); $itemTitle = $item->getTitle();
$itemUri = $item->getURI(); $itemUri = $item->getURI();
$itemContent = $item->getContent() ? break_annoying_html_tags($item->getContent()) : ''; $itemContent = $item->getContent() ? break_annoying_html_tags($item->getContent()) : '';
$entryID = $item->getUid(); $itemUid = $item->getUid();
$isPermaLink = 'false'; $isPermaLink = 'false';
if (empty($entryID) && !empty($itemUri)) { if (empty($itemUid) && !empty($itemUri)) {
// Fallback to provided URI // Fallback to provided URI
$entryID = $itemUri; $itemUid = $itemUri;
$isPermaLink = 'true'; $isPermaLink = 'true';
} }
if (empty($entryID)) { if (empty($itemUid)) {
// Fallback to title and content // Fallback to title and content
$entryID = hash('sha1', $itemTitle . $itemContent); $itemUid = hash('sha1', $itemTitle . $itemContent);
} }
$entry = $document->createElement('item'); $entry = $document->createElement('item');
@ -126,7 +128,19 @@ class MrssFormat extends FormatAbstract
$entryTitle->appendChild($document->createTextNode($itemTitle)); $entryTitle->appendChild($document->createTextNode($itemTitle));
} }
if (!empty($itemUri)) { if (isset($itemArray['itunes'])) {
$feed->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:itunes', self::ITUNES_NS);
foreach ($itemArray['itunes'] as $itunesKey => $itunesValue) {
$itunesProperty = $document->createElementNS(self::ITUNES_NS, $itunesKey);
$entry->appendChild($itunesProperty);
$itunesProperty->appendChild($document->createTextNode($itunesValue));
}
$itunesEnclosure = $document->createElement('enclosure');
$entry->appendChild($itunesEnclosure);
$itunesEnclosure->setAttribute('url', $itemArray['enclosure']['url']);
$itunesEnclosure->setAttribute('length', $itemArray['enclosure']['length']);
$itunesEnclosure->setAttribute('type', $itemArray['enclosure']['type']);
} if (!empty($itemUri)) {
$entryLink = $document->createElement('link'); $entryLink = $document->createElement('link');
$entry->appendChild($entryLink); $entry->appendChild($entryLink);
$entryLink->appendChild($document->createTextNode($itemUri)); $entryLink->appendChild($document->createTextNode($itemUri));
@ -135,7 +149,7 @@ class MrssFormat extends FormatAbstract
$entryGuid = $document->createElement('guid'); $entryGuid = $document->createElement('guid');
$entryGuid->setAttribute('isPermaLink', $isPermaLink); $entryGuid->setAttribute('isPermaLink', $isPermaLink);
$entry->appendChild($entryGuid); $entry->appendChild($entryGuid);
$entryGuid->appendChild($document->createTextNode($entryID)); $entryGuid->appendChild($document->createTextNode($itemUid));
if (!empty($itemTimestamp)) { if (!empty($itemTimestamp)) {
$entryPublished = $document->createElement('pubDate'); $entryPublished = $document->createElement('pubDate');

View File

@ -3,11 +3,13 @@
declare(strict_types=1); declare(strict_types=1);
/** /**
* Very basic and naive feed parser that scrapes out rss 0.91, 1.0, 2.0 and atom 1.0. * Very basic and naive feed parser.
* *
* Emit arrays meant to be used inside rss-bridge. * Scrapes out rss 0.91, 1.0, 2.0 and atom 1.0.
* *
* The feed item structure is identical to that of FeedItem * Produce arrays meant to be used inside rss-bridge.
*
* The item structure is tweaked so that it works with FeedItem
*/ */
final class FeedParser final class FeedParser
{ {
@ -85,9 +87,7 @@ final class FeedParser
public function parseAtomItem(\SimpleXMLElement $feedItem): array public function parseAtomItem(\SimpleXMLElement $feedItem): array
{ {
// Some ATOM entries also contain RSS 2.0 fields
$item = $this->parseRss2Item($feedItem); $item = $this->parseRss2Item($feedItem);
if (isset($feedItem->id)) { if (isset($feedItem->id)) {
$item['uri'] = (string)$feedItem->id; $item['uri'] = (string)$feedItem->id;
} }
@ -131,8 +131,35 @@ final class FeedParser
public function parseRss2Item(\SimpleXMLElement $feedItem): array public function parseRss2Item(\SimpleXMLElement $feedItem): array
{ {
// Primary data is compatible to 0.91 with some additional data $item = [
$item = $this->parseRss091Item($feedItem); 'uri' => '',
'title' => '',
'content' => '',
'timestamp' => '',
'author' => '',
//'uid' => null,
//'categories' => [],
//'enclosures' => [],
];
foreach ($feedItem as $k => $v) {
$hasChildren = count($v) !== 0;
if (!$hasChildren) {
$item[$k] = (string) $v;
}
}
if (isset($feedItem->link)) {
// todo: trim uri
$item['uri'] = (string)$feedItem->link;
}
if (isset($feedItem->title)) {
$item['title'] = html_entity_decode((string)$feedItem->title);
}
if (isset($feedItem->description)) {
$item['content'] = (string)$feedItem->description;
}
$namespaces = $feedItem->getNamespaces(true); $namespaces = $feedItem->getNamespaces(true);
if (isset($namespaces['dc'])) { if (isset($namespaces['dc'])) {
$dc = $feedItem->children($namespaces['dc']); $dc = $feedItem->children($namespaces['dc']);
@ -140,7 +167,24 @@ final class FeedParser
if (isset($namespaces['media'])) { if (isset($namespaces['media'])) {
$media = $feedItem->children($namespaces['media']); $media = $feedItem->children($namespaces['media']);
} }
foreach ($namespaces as $namespaceName => $namespaceUrl) {
if (in_array($namespaceName, ['', 'content', 'media'])) {
continue;
}
$module = $feedItem->children($namespaceUrl);
$item[$namespaceName] = [];
foreach ($module as $moduleKey => $moduleValue) {
$item[$namespaceName][$moduleKey] = (string) $moduleValue;
}
}
if (isset($namespaces['itunes'])) {
$enclosure = $feedItem->enclosure;
$item['enclosure'] = [
'url' => (string) $enclosure['url'],
'length' => (string) $enclosure['length'],
'type' => (string) $enclosure['type'],
];
}
if (isset($feedItem->guid)) { if (isset($feedItem->guid)) {
// Pluck out a url from guid // Pluck out a url from guid
foreach ($feedItem->guid->attributes() as $attribute => $value) { foreach ($feedItem->guid->attributes() as $attribute => $value) {
@ -185,8 +229,26 @@ final class FeedParser
public function parseRss1Item(\SimpleXMLElement $feedItem): array public function parseRss1Item(\SimpleXMLElement $feedItem): array
{ {
// 1.0 adds optional elements around the 0.91 standard $item = [
$item = $this->parseRss091Item($feedItem); 'uri' => '',
'title' => '',
'content' => '',
'timestamp' => '',
'author' => '',
//'uid' => null,
//'categories' => [],
//'enclosures' => [],
];
if (isset($feedItem->link)) {
// todo: trim uri
$item['uri'] = (string)$feedItem->link;
}
if (isset($feedItem->title)) {
$item['title'] = html_entity_decode((string)$feedItem->title);
}
if (isset($feedItem->description)) {
$item['content'] = (string)$feedItem->description;
}
$namespaces = $feedItem->getNamespaces(true); $namespaces = $feedItem->getNamespaces(true);
if (isset($namespaces['dc'])) { if (isset($namespaces['dc'])) {
$dc = $feedItem->children($namespaces['dc']); $dc = $feedItem->children($namespaces['dc']);
@ -199,32 +261,4 @@ final class FeedParser
} }
return $item; return $item;
} }
/**
 * Parse a single RSS 0.91 item element into an rss-bridge item array.
 *
 * RSS 0.91 does not support timestamps, authors or enclosures, so the
 * 'timestamp' and 'author' keys are always returned as null.
 *
 * @param \SimpleXMLElement $feedItem The item element to read from.
 * @return array Item with the keys 'uri', 'title', 'content', 'timestamp'
 *               and 'author'; a key stays null when its element is absent.
 */
public function parseRss091Item(\SimpleXMLElement $feedItem): array
{
    $item = [
        'uri' => null,
        'title' => null,
        'content' => null,
        'timestamp' => null,
        'author' => null,
        //'uid' => null,
        //'categories' => [],
        //'enclosures' => [],
    ];

    if (isset($feedItem->link)) {
        // Strip surrounding whitespace so the URI can be used as-is
        // (resolves the former "todo: trim uri").
        $item['uri'] = trim((string)$feedItem->link);
    }

    if (isset($feedItem->title)) {
        // Titles may carry HTML entities; decode them to plain text.
        $item['title'] = html_entity_decode((string)$feedItem->title);
    }

    if (isset($feedItem->description)) {
        $item['content'] = (string)$feedItem->description;
    }

    return $item;
}
} }

View File

@ -2,6 +2,8 @@
abstract class FormatAbstract abstract class FormatAbstract
{ {
public const ITUNES_NS = 'http://www.itunes.com/dtds/podcast-1.0.dtd';
const MIME_TYPE = 'text/plain'; const MIME_TYPE = 'text/plain';
protected string $charset = 'UTF-8'; protected string $charset = 'UTF-8';