feat: preserve and reproduce podcast feeds (itunes rss module) (#3759)

This commit is contained in:
Dag 2023-10-16 02:58:03 +02:00 committed by GitHub
parent 408c2e5e91
commit ef5bd83bd0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 126 additions and 85 deletions

View File

@ -50,7 +50,9 @@ class CssSelectorFeedExpanderBridge extends CssSelectorBridge
$discard_thumbnail = $this->getInput('discard_thumbnail');
$limit = $this->getInput('limit');
$source_feed = (new FeedParser())->parseFeed(getContents($url));
$feedParser = new FeedParser();
$xml = getContents($url);
$source_feed = $feedParser->parseFeed($xml);
$items = $source_feed['items'];
// Map Homepage URL (Default: Root page)

View File

@ -62,52 +62,27 @@ class NyaaTorrentsBridge extends BridgeAbstract
public function collectData()
{
// Manually parsing because we need to access the nyaa namespace in the xml
$xml = simplexml_load_string(getContents($this->getURI()));
$channel = $xml->channel[0];
$feed = [];
$feed['title'] = trim((string)$channel->title);
$feed['uri'] = trim((string)$channel->link);
if (!empty($channel->image)) {
$feed['icon'] = trim((string)$channel->image->url);
}
$items = $xml->channel[0]->item;
foreach ($items as $feedItem) {
$item = [
'title' => (string) $feedItem->title,
'uri' => (string) $feedItem->link,
];
$feedParser = new FeedParser();
$feed = $feedParser->parseFeed(getContents($this->getURI()));
foreach ($feed['items'] as $item) {
$item['id'] = str_replace(['https://nyaa.si/download/', '.torrent'], '', $item['uri']);
$nyaaNamespace = (array)($feedItem->children('nyaa', true));
$item = array_merge($item, $nyaaNamespace);
// Convert URI from torrent file to web page
$item['uri'] = str_replace('/download/', '/view/', $item['uri']);
$item['uri'] = str_replace('.torrent', '', $item['uri']);
$item_html = getSimpleHTMLDOMCached($item['uri']);
if ($item_html) {
// Retrieve full description from page contents
$item_desc = str_get_html(
markdownToHtml(html_entity_decode($item_html->find('#torrent-description', 0)->innertext))
);
// Retrieve image for thumbnail or generic logo fallback
$dom = getSimpleHTMLDOMCached($item['uri']);
if ($dom) {
$description = $dom->find('#torrent-description', 0)->innertext ?? '';
$itemDom = str_get_html(markdownToHtml(html_entity_decode($description)));
$item_image = $this->getURI() . 'static/img/avatar/default.png';
foreach ($item_desc->find('img') as $img) {
foreach ($itemDom->find('img') as $img) {
if (strpos($img->src, 'prez') === false) {
$item_image = $img->src;
break;
}
}
$item['enclosures'] = [$item_image];
$item['content'] = $item_desc;
$item['content'] = (string) $itemDom;
}
$this->items[] = $item;
if (count($this->items) >= 10) {
break;

View File

@ -28,6 +28,7 @@
"ext-openssl": "*",
"ext-libxml": "*",
"ext-simplexml": "*",
"ext-dom": "*",
"ext-json": "*"
},
"require-dev": {
@ -38,8 +39,7 @@
"ext-memcached": "Allows to use memcached as cache type",
"ext-sqlite3": "Allows to use an SQLite database for caching",
"ext-zip": "Required for FDroidRepoBridge",
"ext-intl": "Required for OLXBridge",
"ext-dom": "Allows to use some bridges based on XPath expressions"
"ext-intl": "Required for OLXBridge"
},
"autoload-dev": {
"psr-4": {

View File

@ -16,6 +16,8 @@ class AtomFormat extends FormatAbstract
public function stringify()
{
$document = new \DomDocument('1.0', $this->getCharset());
$feedUrl = get_current_url();
$extraInfos = $this->getExtraInfos();
@ -25,7 +27,6 @@ class AtomFormat extends FormatAbstract
$uri = $extraInfos['uri'];
}
$document = new \DomDocument('1.0', $this->getCharset());
$document->formatOutput = true;
$feed = $document->createElementNS(self::ATOM_NS, 'feed');
$document->appendChild($feed);
@ -81,6 +82,7 @@ class AtomFormat extends FormatAbstract
$linkSelf->setAttribute('href', $feedUrl);
foreach ($this->getItems() as $item) {
$itemArray = $item->toArray();
$entryTimestamp = $item->getTimestamp();
$entryTitle = $item->getTitle();
$entryContent = $item->getContent();
@ -138,7 +140,19 @@ class AtomFormat extends FormatAbstract
$entry->appendChild($id);
$id->appendChild($document->createTextNode($entryID));
if (!empty($entryUri)) {
if (isset($itemArray['itunes'])) {
$feed->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:itunes', self::ITUNES_NS);
foreach ($itemArray['itunes'] as $itunesKey => $itunesValue) {
$itunesProperty = $document->createElementNS(self::ITUNES_NS, $itunesKey);
$entry->appendChild($itunesProperty);
$itunesProperty->appendChild($document->createTextNode($itunesValue));
}
$itunesEnclosure = $document->createElement('enclosure');
$entry->appendChild($itunesEnclosure);
$itunesEnclosure->setAttribute('url', $itemArray['enclosure']['url']);
$itunesEnclosure->setAttribute('length', $itemArray['enclosure']['length']);
$itunesEnclosure->setAttribute('type', $itemArray['enclosure']['type']);
} elseif (!empty($entryUri)) {
$entryLinkAlternate = $document->createElement('link');
$entry->appendChild($entryLinkAlternate);
$entryLinkAlternate->setAttribute('rel', 'alternate');

View File

@ -34,6 +34,8 @@ class MrssFormat extends FormatAbstract
public function stringify()
{
$document = new \DomDocument('1.0', $this->getCharset());
$feedUrl = get_current_url();
$extraInfos = $this->getExtraInfos();
if (empty($extraInfos['uri'])) {
@ -42,7 +44,6 @@ class MrssFormat extends FormatAbstract
$uri = $extraInfos['uri'];
}
$document = new \DomDocument('1.0', $this->getCharset());
$document->formatOutput = true;
$feed = $document->createElement('rss');
$document->appendChild($feed);
@ -99,22 +100,23 @@ class MrssFormat extends FormatAbstract
$linkSelf->setAttribute('href', $feedUrl);
foreach ($this->getItems() as $item) {
$itemArray = $item->toArray();
$itemTimestamp = $item->getTimestamp();
$itemTitle = $item->getTitle();
$itemUri = $item->getURI();
$itemContent = $item->getContent() ? break_annoying_html_tags($item->getContent()) : '';
$entryID = $item->getUid();
$itemUid = $item->getUid();
$isPermaLink = 'false';
if (empty($entryID) && !empty($itemUri)) {
if (empty($itemUid) && !empty($itemUri)) {
// Fallback to provided URI
$entryID = $itemUri;
$itemUid = $itemUri;
$isPermaLink = 'true';
}
if (empty($entryID)) {
if (empty($itemUid)) {
// Fallback to title and content
$entryID = hash('sha1', $itemTitle . $itemContent);
$itemUid = hash('sha1', $itemTitle . $itemContent);
}
$entry = $document->createElement('item');
@ -126,7 +128,19 @@ class MrssFormat extends FormatAbstract
$entryTitle->appendChild($document->createTextNode($itemTitle));
}
if (!empty($itemUri)) {
if (isset($itemArray['itunes'])) {
$feed->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:itunes', self::ITUNES_NS);
foreach ($itemArray['itunes'] as $itunesKey => $itunesValue) {
$itunesProperty = $document->createElementNS(self::ITUNES_NS, $itunesKey);
$entry->appendChild($itunesProperty);
$itunesProperty->appendChild($document->createTextNode($itunesValue));
}
$itunesEnclosure = $document->createElement('enclosure');
$entry->appendChild($itunesEnclosure);
$itunesEnclosure->setAttribute('url', $itemArray['enclosure']['url']);
$itunesEnclosure->setAttribute('length', $itemArray['enclosure']['length']);
$itunesEnclosure->setAttribute('type', $itemArray['enclosure']['type']);
} if (!empty($itemUri)) {
$entryLink = $document->createElement('link');
$entry->appendChild($entryLink);
$entryLink->appendChild($document->createTextNode($itemUri));
@ -135,7 +149,7 @@ class MrssFormat extends FormatAbstract
$entryGuid = $document->createElement('guid');
$entryGuid->setAttribute('isPermaLink', $isPermaLink);
$entry->appendChild($entryGuid);
$entryGuid->appendChild($document->createTextNode($entryID));
$entryGuid->appendChild($document->createTextNode($itemUid));
if (!empty($itemTimestamp)) {
$entryPublished = $document->createElement('pubDate');

View File

@ -3,11 +3,13 @@
declare(strict_types=1);
/**
* Very basic and naive feed parser that scrapes out rss 0.91, 1.0, 2.0 and atom 1.0.
* Very basic and naive feed parser.
*
* Emit arrays meant to be used inside rss-bridge.
* Scrapes out rss 0.91, 1.0, 2.0 and atom 1.0.
*
* The feed item structure is identical to that of FeedItem
* Produce arrays meant to be used inside rss-bridge.
*
* The item structure is tweaked so that it works with FeedItem
*/
final class FeedParser
{
@ -85,9 +87,7 @@ final class FeedParser
public function parseAtomItem(\SimpleXMLElement $feedItem): array
{
// Some ATOM entries also contain RSS 2.0 fields
$item = $this->parseRss2Item($feedItem);
if (isset($feedItem->id)) {
$item['uri'] = (string)$feedItem->id;
}
@ -131,8 +131,35 @@ final class FeedParser
public function parseRss2Item(\SimpleXMLElement $feedItem): array
{
// Primary data is compatible with 0.91, with some additional data
$item = $this->parseRss091Item($feedItem);
$item = [
'uri' => '',
'title' => '',
'content' => '',
'timestamp' => '',
'author' => '',
//'uid' => null,
//'categories' => [],
//'enclosures' => [],
];
foreach ($feedItem as $k => $v) {
$hasChildren = count($v) !== 0;
if (!$hasChildren) {
$item[$k] = (string) $v;
}
}
if (isset($feedItem->link)) {
// todo: trim uri
$item['uri'] = (string)$feedItem->link;
}
if (isset($feedItem->title)) {
$item['title'] = html_entity_decode((string)$feedItem->title);
}
if (isset($feedItem->description)) {
$item['content'] = (string)$feedItem->description;
}
$namespaces = $feedItem->getNamespaces(true);
if (isset($namespaces['dc'])) {
$dc = $feedItem->children($namespaces['dc']);
@ -140,7 +167,24 @@ final class FeedParser
if (isset($namespaces['media'])) {
$media = $feedItem->children($namespaces['media']);
}
foreach ($namespaces as $namespaceName => $namespaceUrl) {
if (in_array($namespaceName, ['', 'content', 'media'])) {
continue;
}
$module = $feedItem->children($namespaceUrl);
$item[$namespaceName] = [];
foreach ($module as $moduleKey => $moduleValue) {
$item[$namespaceName][$moduleKey] = (string) $moduleValue;
}
}
if (isset($namespaces['itunes'])) {
$enclosure = $feedItem->enclosure;
$item['enclosure'] = [
'url' => (string) $enclosure['url'],
'length' => (string) $enclosure['length'],
'type' => (string) $enclosure['type'],
];
}
if (isset($feedItem->guid)) {
// Pluck out a url from guid
foreach ($feedItem->guid->attributes() as $attribute => $value) {
@ -185,8 +229,26 @@ final class FeedParser
public function parseRss1Item(\SimpleXMLElement $feedItem): array
{
// 1.0 adds optional elements around the 0.91 standard
$item = $this->parseRss091Item($feedItem);
$item = [
'uri' => '',
'title' => '',
'content' => '',
'timestamp' => '',
'author' => '',
//'uid' => null,
//'categories' => [],
//'enclosures' => [],
];
if (isset($feedItem->link)) {
// todo: trim uri
$item['uri'] = (string)$feedItem->link;
}
if (isset($feedItem->title)) {
$item['title'] = html_entity_decode((string)$feedItem->title);
}
if (isset($feedItem->description)) {
$item['content'] = (string)$feedItem->description;
}
$namespaces = $feedItem->getNamespaces(true);
if (isset($namespaces['dc'])) {
$dc = $feedItem->children($namespaces['dc']);
@ -199,32 +261,4 @@ final class FeedParser
}
return $item;
}
/**
 * Build an item array from a single RSS 0.91 <item> element.
 *
 * Only the link, title and description children are read. The 'timestamp'
 * and 'author' keys are always null, and the 'uid', 'categories' and
 * 'enclosures' keys are not set here.
 *
 * @param \SimpleXMLElement $feedItem The <item> element to read.
 * @return array Item array with the keys uri, title, content, timestamp and author.
 */
public function parseRss091Item(\SimpleXMLElement $feedItem): array
{
    // todo: trim uri
    $link = isset($feedItem->link) ? (string) $feedItem->link : null;

    // Titles may carry HTML entities, so decode them to plain text.
    $heading = isset($feedItem->title)
        ? html_entity_decode((string) $feedItem->title)
        : null;

    $body = isset($feedItem->description) ? (string) $feedItem->description : null;

    return [
        'uri' => $link,
        'title' => $heading,
        'content' => $body,
        // rss 0.91 doesn't support timestamps
        'timestamp' => null,
        // rss 0.91 doesn't support authors
        'author' => null,
        // rss 0.91 doesn't support enclosures
    ];
}
}

View File

@ -2,6 +2,8 @@
abstract class FormatAbstract
{
public const ITUNES_NS = 'http://www.itunes.com/dtds/podcast-1.0.dtd';
const MIME_TYPE = 'text/plain';
protected string $charset = 'UTF-8';