refactor: more feed parsing tweaks (#3748)

This commit is contained in:
Dag 2023-10-13 02:31:09 +02:00 committed by GitHub
parent 2880524dfc
commit 49d9dafaec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 20 additions and 7 deletions

View File

@@ -40,7 +40,7 @@ class TapasBridge extends FeedExpander
$this->id = $html->find('meta[property$=":url"]', 0)->content;
$this->id = str_ireplace(['tapastic://series/', '/info'], '', $this->id);
}
$this->collectExpandableDatas($this->getURI());
$this->collectExpandableDatas($this->getURI(), 10);
}
protected function parseItem(array $item)
@@ -55,9 +55,8 @@ class TapasBridge extends FeedExpander
if ($this->getInput('extend_content')) {
$html = getSimpleHTMLDOM($item['uri']);
if (!$item['content']) {
$item['content'] = '';
}
$item['content'] = $item['content'] ?? '';
if ($html->find('article.main__body', 0)) {
foreach ($html->find('article', 0)->find('img') as $line) {
$item['content'] .= '<img src="' . $line->{'data-src'} . '">';

View File

@@ -22,6 +22,11 @@ abstract class FeedExpander extends BridgeAbstract
if ($xmlString === '') {
throw new \Exception(sprintf('Unable to parse xml from `%s` because we got the empty string', $url), 10);
}
// prepare/massage the xml to make it more acceptable
$badStrings = [
'&raquo;',
];
$xmlString = str_replace($badStrings, '', $xmlString);
$feedParser = new FeedParser();
$this->feed = $feedParser->parseFeed($xmlString);
$items = array_slice($this->feed['items'], 0, $maxItems);

View File

@@ -11,7 +11,10 @@ final class FeedParser
$xmlErrors = libxml_get_errors();
libxml_use_internal_errors(false);
if ($xml === false) {
throw new \Exception('Unable to parse xml');
if ($xmlErrors) {
$firstXmlErrorMessage = $xmlErrors[0]->message;
}
throw new \Exception(sprintf('Unable to parse xml: %s', $firstXmlErrorMessage ?? ''));
}
$feed = [
'title' => null,
@@ -123,7 +126,6 @@ final class FeedParser
{
// Primary data is compatible to 0.91 with some additional data
$item = $this->parseRss091Item($feedItem);
$namespaces = $feedItem->getNamespaces(true);
if (isset($namespaces['dc'])) {
$dc = $feedItem->children($namespaces['dc']);
@@ -192,7 +194,14 @@ final class FeedParser
public function parseRss091Item(\SimpleXMLElement $feedItem): array
{
$item = [];
$item = [
'uri' => null,
'title' => null,
'content' => null,
'timestamp' => null,
'author' => null,
'enclosures' => [],
];
if (isset($feedItem->link)) {
// todo: trim uri
$item['uri'] = (string)$feedItem->link;