diff --git a/bridges/CssSelectorFeedExpanderBridge.php b/bridges/CssSelectorFeedExpanderBridge.php index 9f332fb9..49bbd473 100644 --- a/bridges/CssSelectorFeedExpanderBridge.php +++ b/bridges/CssSelectorFeedExpanderBridge.php @@ -50,7 +50,9 @@ class CssSelectorFeedExpanderBridge extends CssSelectorBridge $discard_thumbnail = $this->getInput('discard_thumbnail'); $limit = $this->getInput('limit'); - $source_feed = (new FeedParser())->parseFeed(getContents($url)); + $feedParser = new FeedParser(); + $xml = getContents($url); + $source_feed = $feedParser->parseFeed($xml); $items = $source_feed['items']; // Map Homepage URL (Default: Root page) diff --git a/bridges/NyaaTorrentsBridge.php b/bridges/NyaaTorrentsBridge.php index f7eea07f..fcf2b197 100644 --- a/bridges/NyaaTorrentsBridge.php +++ b/bridges/NyaaTorrentsBridge.php @@ -62,52 +62,27 @@ class NyaaTorrentsBridge extends BridgeAbstract public function collectData() { - // Manually parsing because we need to acccess the nyaa namespace in the xml - $xml = simplexml_load_string(getContents($this->getURI())); - $channel = $xml->channel[0]; - $feed = []; - $feed['title'] = trim((string)$channel->title); - $feed['uri'] = trim((string)$channel->link); - if (!empty($channel->image)) { - $feed['icon'] = trim((string)$channel->image->url); - } - $items = $xml->channel[0]->item; - foreach ($items as $feedItem) { - $item = [ - 'title' => (string) $feedItem->title, - 'uri' => (string) $feedItem->link, - ]; - + $feedParser = new FeedParser(); + $feed = $feedParser->parseFeed(getContents($this->getURI())); + foreach ($feed['items'] as $item) { $item['id'] = str_replace(['https://nyaa.si/download/', '.torrent'], '', $item['uri']); - - $nyaaNamespace = (array)($feedItem->children('nyaa', true)); - $item = array_merge($item, $nyaaNamespace); - - // Convert URI from torrent file to web page $item['uri'] = str_replace('/download/', '/view/', $item['uri']); $item['uri'] = str_replace('.torrent', '', $item['uri']); - - $item_html = getSimpleHTMLDOMCached($item['uri']); - if ($item_html) { - // Retrieve full description from page contents - $item_desc = str_get_html( - markdownToHtml(html_entity_decode($item_html->find('#torrent-description', 0)->innertext)) - ); - - // Retrieve image for thumbnail or generic logo fallback + $dom = getSimpleHTMLDOMCached($item['uri']); + if ($dom) { + $description = $dom->find('#torrent-description', 0)->innertext ?? ''; + $itemDom = str_get_html(markdownToHtml(html_entity_decode($description))); $item_image = $this->getURI() . 'static/img/avatar/default.png'; - foreach ($item_desc->find('img') as $img) { + foreach ($itemDom->find('img') as $img) { if (strpos($img->src, 'prez') === false) { $item_image = $img->src; break; } } - $item['enclosures'] = [$item_image]; - $item['content'] = $item_desc; + $item['content'] = (string) $itemDom; } - $this->items[] = $item; if (count($this->items) >= 10) { break; diff --git a/composer.json b/composer.json index 31e31d74..0e7abb84 100644 --- a/composer.json +++ b/composer.json @@ -28,6 +28,7 @@ "ext-openssl": "*", "ext-libxml": "*", "ext-simplexml": "*", + "ext-dom": "*", "ext-json": "*" }, "require-dev": { @@ -38,8 +39,7 @@ "ext-memcached": "Allows to use memcached as cache type", "ext-sqlite3": "Allows to use an SQLite database for caching", "ext-zip": "Required for FDroidRepoBridge", - "ext-intl": "Required for OLXBridge", - "ext-dom": "Allows to use some bridges based on XPath expressions" + "ext-intl": "Required for OLXBridge" }, "autoload-dev": { "psr-4": { diff --git a/formats/AtomFormat.php b/formats/AtomFormat.php index 9886e4b7..d59e42fe 100644 --- a/formats/AtomFormat.php +++ b/formats/AtomFormat.php @@ -16,6 +16,8 @@ class AtomFormat extends FormatAbstract public function stringify() { + $document = new \DomDocument('1.0', $this->getCharset()); + $feedUrl = get_current_url(); $extraInfos = $this->getExtraInfos(); @@ -25,7 +27,6 @@ class AtomFormat extends FormatAbstract $uri = $extraInfos['uri']; } - $document = new \DomDocument('1.0', $this->getCharset()); $document->formatOutput = true; $feed = $document->createElementNS(self::ATOM_NS, 'feed'); $document->appendChild($feed); @@ -81,6 +82,7 @@ class AtomFormat extends FormatAbstract $linkSelf->setAttribute('href', $feedUrl); foreach ($this->getItems() as $item) { + $itemArray = $item->toArray(); $entryTimestamp = $item->getTimestamp(); $entryTitle = $item->getTitle(); $entryContent = $item->getContent(); @@ -138,7 +140,19 @@ class AtomFormat extends FormatAbstract $entry->appendChild($id); $id->appendChild($document->createTextNode($entryID)); - if (!empty($entryUri)) { + if (isset($itemArray['itunes'])) { + $feed->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:itunes', self::ITUNES_NS); + foreach ($itemArray['itunes'] as $itunesKey => $itunesValue) { + $itunesProperty = $document->createElementNS(self::ITUNES_NS, $itunesKey); + $entry->appendChild($itunesProperty); + $itunesProperty->appendChild($document->createTextNode($itunesValue)); + } + $itunesEnclosure = $document->createElement('enclosure'); + $entry->appendChild($itunesEnclosure); + $itunesEnclosure->setAttribute('url', $itemArray['enclosure']['url']); + $itunesEnclosure->setAttribute('length', $itemArray['enclosure']['length']); + $itunesEnclosure->setAttribute('type', $itemArray['enclosure']['type']); + } elseif (!empty($entryUri)) { $entryLinkAlternate = $document->createElement('link'); $entry->appendChild($entryLinkAlternate); $entryLinkAlternate->setAttribute('rel', 'alternate'); diff --git a/formats/MrssFormat.php b/formats/MrssFormat.php index 984611c7..4fd06439 100644 --- a/formats/MrssFormat.php +++ b/formats/MrssFormat.php @@ -34,6 +34,8 @@ class MrssFormat extends FormatAbstract public function stringify() { + $document = new \DomDocument('1.0', $this->getCharset()); + $feedUrl = get_current_url(); $extraInfos = $this->getExtraInfos(); if (empty($extraInfos['uri'])) { @@ -42,7 +44,6 @@ class MrssFormat extends FormatAbstract $uri = $extraInfos['uri']; } - $document = new \DomDocument('1.0', $this->getCharset()); $document->formatOutput = true; $feed = $document->createElement('rss'); $document->appendChild($feed); @@ -99,22 +100,23 @@ class MrssFormat extends FormatAbstract $linkSelf->setAttribute('href', $feedUrl); foreach ($this->getItems() as $item) { + $itemArray = $item->toArray(); $itemTimestamp = $item->getTimestamp(); $itemTitle = $item->getTitle(); $itemUri = $item->getURI(); $itemContent = $item->getContent() ? break_annoying_html_tags($item->getContent()) : ''; - $entryID = $item->getUid(); + $itemUid = $item->getUid(); $isPermaLink = 'false'; - if (empty($entryID) && !empty($itemUri)) { + if (empty($itemUid) && !empty($itemUri)) { // Fallback to provided URI - $entryID = $itemUri; + $itemUid = $itemUri; $isPermaLink = 'true'; } - if (empty($entryID)) { + if (empty($itemUid)) { // Fallback to title and content - $entryID = hash('sha1', $itemTitle . $itemContent); + $itemUid = hash('sha1', $itemTitle . $itemContent); } $entry = $document->createElement('item'); @@ -126,7 +128,19 @@ class MrssFormat extends FormatAbstract $entryTitle->appendChild($document->createTextNode($itemTitle)); } - if (!empty($itemUri)) { + if (isset($itemArray['itunes'])) { + $feed->setAttributeNS('http://www.w3.org/2000/xmlns/', 'xmlns:itunes', self::ITUNES_NS); + foreach ($itemArray['itunes'] as $itunesKey => $itunesValue) { + $itunesProperty = $document->createElementNS(self::ITUNES_NS, $itunesKey); + $entry->appendChild($itunesProperty); + $itunesProperty->appendChild($document->createTextNode($itunesValue)); + } + $itunesEnclosure = $document->createElement('enclosure'); + $entry->appendChild($itunesEnclosure); + $itunesEnclosure->setAttribute('url', $itemArray['enclosure']['url']); + $itunesEnclosure->setAttribute('length', $itemArray['enclosure']['length']); + $itunesEnclosure->setAttribute('type', $itemArray['enclosure']['type']); + } if (!empty($itemUri)) { $entryLink = $document->createElement('link'); $entry->appendChild($entryLink); $entryLink->appendChild($document->createTextNode($itemUri)); @@ -135,7 +149,7 @@ class MrssFormat extends FormatAbstract $entryGuid = $document->createElement('guid'); $entryGuid->setAttribute('isPermaLink', $isPermaLink); $entry->appendChild($entryGuid); - $entryGuid->appendChild($document->createTextNode($entryID)); + $entryGuid->appendChild($document->createTextNode($itemUid)); if (!empty($itemTimestamp)) { $entryPublished = $document->createElement('pubDate'); diff --git a/lib/FeedParser.php b/lib/FeedParser.php index 1393f5f5..2d982de1 100644 --- a/lib/FeedParser.php +++ b/lib/FeedParser.php @@ -3,11 +3,13 @@ declare(strict_types=1); /** - * Very basic and naive feed parser that srapes out rss 0.91, 1.0, 2.0 and atom 1.0. + * Very basic and naive feed parser. * - * Emit arrays meant to be used inside rss-bridge. + * Scrapes out rss 0.91, 1.0, 2.0 and atom 1.0. * - * The feed item structure is identical to that of FeedItem + * Produce arrays meant to be used inside rss-bridge. + * + * The item structure is tweaked so that works with FeedItem */ final class FeedParser { @@ -85,9 +87,7 @@ final class FeedParser public function parseAtomItem(\SimpleXMLElement $feedItem): array { - // Some ATOM entries also contain RSS 2.0 fields $item = $this->parseRss2Item($feedItem); - if (isset($feedItem->id)) { $item['uri'] = (string)$feedItem->id; } @@ -131,8 +131,35 @@ final class FeedParser public function parseRss2Item(\SimpleXMLElement $feedItem): array { - // Primary data is compatible to 0.91 with some additional data - $item = $this->parseRss091Item($feedItem); + $item = [ + 'uri' => '', + 'title' => '', + 'content' => '', + 'timestamp' => '', + 'author' => '', + //'uid' => null, + //'categories' => [], + //'enclosures' => [], + ]; + + foreach ($feedItem as $k => $v) { + $hasChildren = count($v) !== 0; + if (!$hasChildren) { + $item[$k] = (string) $v; + } + } + + if (isset($feedItem->link)) { + // todo: trim uri + $item['uri'] = (string)$feedItem->link; + } + if (isset($feedItem->title)) { + $item['title'] = html_entity_decode((string)$feedItem->title); + } + if (isset($feedItem->description)) { + $item['content'] = (string)$feedItem->description; + } + $namespaces = $feedItem->getNamespaces(true); if (isset($namespaces['dc'])) { $dc = $feedItem->children($namespaces['dc']); @@ -140,7 +167,24 @@ final class FeedParser if (isset($namespaces['media'])) { $media = $feedItem->children($namespaces['media']); } - + foreach ($namespaces as $namespaceName => $namespaceUrl) { + if (in_array($namespaceName, ['', 'content', 'media'])) { + continue; + } + $module = $feedItem->children($namespaceUrl); + $item[$namespaceName] = []; + foreach ($module as $moduleKey => $moduleValue) { + $item[$namespaceName][$moduleKey] = (string) $moduleValue; + } + } + if (isset($namespaces['itunes'])) { + $enclosure = $feedItem->enclosure; + $item['enclosure'] = [ + 'url' => (string) $enclosure['url'], + 'length' => (string) $enclosure['length'], + 'type' => (string) $enclosure['type'], + ]; + } if (isset($feedItem->guid)) { // Pluck out a url from guid foreach ($feedItem->guid->attributes() as $attribute => $value) { @@ -185,8 +229,26 @@ final class FeedParser public function parseRss1Item(\SimpleXMLElement $feedItem): array { - // 1.0 adds optional elements around the 0.91 standard - $item = $this->parseRss091Item($feedItem); + $item = [ + 'uri' => '', + 'title' => '', + 'content' => '', + 'timestamp' => '', + 'author' => '', + //'uid' => null, + //'categories' => [], + //'enclosures' => [], + ]; + if (isset($feedItem->link)) { + // todo: trim uri + $item['uri'] = (string)$feedItem->link; + } + if (isset($feedItem->title)) { + $item['title'] = html_entity_decode((string)$feedItem->title); + } + if (isset($feedItem->description)) { + $item['content'] = (string)$feedItem->description; + } $namespaces = $feedItem->getNamespaces(true); if (isset($namespaces['dc'])) { $dc = $feedItem->children($namespaces['dc']); @@ -199,32 +261,4 @@ final class FeedParser } return $item; } - - public function parseRss091Item(\SimpleXMLElement $feedItem): array - { - $item = [ - 'uri' => null, - 'title' => null, - 'content' => null, - 'timestamp' => null, - 'author' => null, - //'uid' => null, - //'categories' => [], - //'enclosures' => [], - ]; - if (isset($feedItem->link)) { - // todo: trim uri - $item['uri'] = (string)$feedItem->link; - } - if (isset($feedItem->title)) { - $item['title'] = html_entity_decode((string)$feedItem->title); - } - // rss 0.91 doesn't support timestamps - // rss 0.91 doesn't support authors - // rss 0.91 doesn't support enclosures - if (isset($feedItem->description)) { - $item['content'] = (string)$feedItem->description; - } - return $item; - } } diff --git a/lib/FormatAbstract.php b/lib/FormatAbstract.php index b05a5764..c76d1e42 100644 --- a/lib/FormatAbstract.php +++ b/lib/FormatAbstract.php @@ -2,6 +2,8 @@ abstract class FormatAbstract { + public const ITUNES_NS = 'http://www.itunes.com/dtds/podcast-1.0.dtd'; + const MIME_TYPE = 'text/plain'; protected string $charset = 'UTF-8';