[YoutubeBridge] handle new youtube description system / fix missing description (#3682)

* [YoutubeBridge] handle new youtube description system

* [YoutubeBridge] fix unrelated warnings

* [YoutubeBridge] discard everything when one link can not be matched & add more boundary chars

* [YoutubeBridge] rebase on master & minor fixes
This commit is contained in:
User123698745 2023-09-22 05:40:13 +02:00 committed by GitHub
parent 7329b83cc0
commit 7a9bfa1087
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 167 additions and 57 deletions

View File

@ -10,7 +10,7 @@
class YoutubeBridge extends BridgeAbstract
{
const NAME = 'YouTube Bridge';
const URI = 'https://www.youtube.com/';
const URI = 'https://www.youtube.com';
const CACHE_TIMEOUT = 10800; // 3h
const DESCRIPTION = 'Returns the 10 newest videos by username/channel/playlist or search';
@ -74,7 +74,7 @@ class YoutubeBridge extends BridgeAbstract
private $feedName = '';
private $feeduri = '';
private $channel_name = '';
private $feedIconUrl = '';
// This took from repo BetterVideoRss of VerifiedJoseph.
const URI_REGEX = '/(https?:\/\/(?:www\.)?(?:[a-zA-Z0-9-.]{2,256}\.[a-z]{2,20})(\:[0-9]{2 ,4})?(?:\/[a-zA-Z0-9@:%_\+.,~#"\'!?&\/\/=\-*]+|\/)?)/ims'; //phpcs:ignore
@ -87,16 +87,16 @@ class YoutubeBridge extends BridgeAbstract
if ($this->getInput('u')) {
/* User and Channel modes */
$this->request = $this->getInput('u');
$url_feed = self::URI . 'feeds/videos.xml?user=' . urlencode($this->request);
$url_listing = self::URI . 'user/' . urlencode($this->request) . '/videos';
$request = $this->getInput('u');
$url_feed = self::URI . '/feeds/videos.xml?user=' . urlencode($request);
$url_listing = self::URI . '/user/' . urlencode($request) . '/videos';
} elseif ($this->getInput('c')) {
$this->request = $this->getInput('c');
$url_feed = self::URI . 'feeds/videos.xml?channel_id=' . urlencode($this->request);
$url_listing = self::URI . 'channel/' . urlencode($this->request) . '/videos';
$request = $this->getInput('c');
$url_feed = self::URI . '/feeds/videos.xml?channel_id=' . urlencode($request);
$url_listing = self::URI . '/channel/' . urlencode($request) . '/videos';
} elseif ($this->getInput('custom')) {
$this->request = $this->getInput('custom');
$url_listing = self::URI . urlencode($this->request) . '/videos';
$request = $this->getInput('custom');
$url_listing = self::URI . '/' . urlencode($request) . '/videos';
}
if (!empty($url_feed) || !empty($url_listing)) {
@ -105,7 +105,7 @@ class YoutubeBridge extends BridgeAbstract
$html = $this->ytGetSimpleHTMLDOM($url_listing);
$jsonData = $this->getJSONData($html);
$url_feed = $jsonData->metadata->channelMetadataRenderer->rssUrl;
$this->iconURL = $jsonData->metadata->channelMetadataRenderer->avatar->thumbnails[0]->url;
$this->feedIconUrl = $jsonData->metadata->channelMetadataRenderer->avatar->thumbnails[0]->url;
}
if (!$this->skipFeeds()) {
$html = $this->ytGetSimpleHTMLDOM($url_feed);
@ -123,7 +123,7 @@ class YoutubeBridge extends BridgeAbstract
// $jsonData = $jsonData->itemSectionRenderer->contents[0]->gridRenderer->items;
$this->parseJSONListing($jsonData);
} else {
returnServerError('Unable to get data from YouTube. Username/Channel: ' . $this->request);
returnServerError('Unable to get data from YouTube. Username/Channel: ' . $request);
}
}
$this->feedName = str_replace(' - YouTube', '', $html->find('title', 0)->plaintext);
@ -133,9 +133,9 @@ class YoutubeBridge extends BridgeAbstract
// To make less requests, we need to cache following dictionary "videoId -> datePublished, duration"
// This cache will be used to find out, which videos to fetch
// to make feed of 15 items or more, if there a lot of videos published on that date.
$this->request = $this->getInput('p');
$url_feed = self::URI . 'feeds/videos.xml?playlist_id=' . urlencode($this->request);
$url_listing = self::URI . 'playlist?list=' . urlencode($this->request);
$request = $this->getInput('p');
$url_feed = self::URI . '/feeds/videos.xml?playlist_id=' . urlencode($request);
$url_listing = self::URI . '/playlist?list=' . urlencode($request);
$html = $this->ytGetSimpleHTMLDOM($url_listing);
$jsonData = $this->getJSONData($html);
// TODO: this method returns only first 100 video items
@ -160,10 +160,10 @@ class YoutubeBridge extends BridgeAbstract
});
} elseif ($this->getInput('s')) {
/* search mode */
$this->request = $this->getInput('s');
$request = $this->getInput('s');
$url_listing = self::URI
. 'results?search_query='
. urlencode($this->request)
. '/results?search_query='
. urlencode($request)
. '&sp=CAI%253D';
$html = $this->ytGetSimpleHTMLDOM($url_listing);
@ -180,7 +180,7 @@ class YoutubeBridge extends BridgeAbstract
}
$this->parseJSONListing($jsonData);
$this->feeduri = $url_listing;
$this->feedName = 'Search: ' . $this->request;
$this->feedName = 'Search: ' . $request;
} else {
/* no valid mode */
returnClientError("You must either specify either:\n - YouTube
@ -206,7 +206,7 @@ class YoutubeBridge extends BridgeAbstract
private function ytBridgeQueryVideoInfo($vid, &$author, &$desc, &$time)
{
$html = $this->ytGetSimpleHTMLDOM(self::URI . "watch?v=$vid", true);
$html = $this->ytGetSimpleHTMLDOM(self::URI . "/watch?v=$vid", true);
// Skip unavailable videos
if (strpos($html->innertext, 'IS_UNAVAILABLE_PAGE') !== false) {
@ -224,7 +224,7 @@ class YoutubeBridge extends BridgeAbstract
}
$jsonData = $this->getJSONData($html);
if (! isset($jsonData->contents)) {
if (!isset($jsonData->contents)) {
return;
}
@ -240,34 +240,149 @@ class YoutubeBridge extends BridgeAbstract
returnServerError('Could not find videoSecondaryInfoRenderer. Error at: ' . $vid);
}
if (isset($videoSecondaryInfo->description)) {
foreach ($videoSecondaryInfo->description->runs as $description) {
if (isset($description->navigationEndpoint)) {
$metadata = $description->navigationEndpoint->commandMetadata->webCommandMetadata;
$web_type = $metadata->webPageType;
$url = $metadata->url;
$text = '';
switch ($web_type) {
case 'WEB_PAGE_TYPE_UNKNOWN':
$url_components = parse_url($url);
if (isset($url_components['query']) && strpos($url_components['query'], '&q=') !== false) {
parse_str($url_components['query'], $params);
$url = urldecode($params['q']);
}
$text = $url;
break;
case 'WEB_PAGE_TYPE_WATCH':
case 'WEB_PAGE_TYPE_BROWSE':
$url = 'https://www.youtube.com' . $url;
$text = $description->text;
break;
}
$desc .= "<a href=\"$url\" target=\"_blank\">$text</a>";
} else {
$desc .= nl2br($description->text);
}
$desc = $videoSecondaryInfo->attributedDescription->content ?? '';
// Default whitespace chars used by trim + non-breaking spaces (https://en.wikipedia.org/wiki/Non-breaking_space)
$whitespaceChars = " \t\n\r\0\x0B\u{A0}\u{2060}\u{202F}\u{2007}";
$descEnhancements = $this->ytBridgeGetVideoDescriptionEnhancements($videoSecondaryInfo, $desc, self::URI, $whitespaceChars);
foreach ($descEnhancements as $descEnhancement) {
if (isset($descEnhancement['url'])) {
$descBefore = mb_substr($desc, 0, $descEnhancement['pos']);
$descValue = mb_substr($desc, $descEnhancement['pos'], $descEnhancement['len']);
$descAfter = mb_substr($desc, $descEnhancement['pos'] + $descEnhancement['len'], null);
// Extended trim for the display value of internal links, e.g.:
// FAVICON • Video Name
// FAVICON / @ChannelName
$descValue = trim($descValue, $whitespaceChars . '•/');
$desc = sprintf('%s<a href="%s" target="_blank">%s</a>%s', $descBefore, $descEnhancement['url'], $descValue, $descAfter);
}
}
$desc = nl2br($desc);
}
private function ytBridgeGetVideoDescriptionEnhancements(
object $videoSecondaryInfo,
string $descriptionContent,
string $baseUrl,
string $whitespaceChars
): array {
$commandRuns = $videoSecondaryInfo->attributedDescription->commandRuns ?? [];
if (count($commandRuns) <= 0) {
return [];
}
$enhancements = [];
$boundaryWhitespaceChars = mb_str_split($whitespaceChars);
$boundaryStartChars = array_merge($boundaryWhitespaceChars, [':', '-', '(']);
$boundaryEndChars = array_merge($boundaryWhitespaceChars, [',', '.', "'", ')']);
$hashtagBoundaryEndChars = array_merge($boundaryEndChars, ['#', '-']);
$descriptionContentLength = mb_strlen($descriptionContent);
$minPositionOffset = 0;
$prevStartPosition = 0;
$totalLength = 0;
$maxPositionByStartIndex = [];
foreach (array_reverse($commandRuns) as $commandRun) {
$endPosition = $commandRun->startIndex + $commandRun->length;
if ($endPosition < $prevStartPosition) {
$totalLength += 1;
}
$totalLength += $commandRun->length;
$maxPositionByStartIndex[$commandRun->startIndex] = $totalLength;
$prevStartPosition = $commandRun->startIndex;
}
foreach ($commandRuns as $commandRun) {
$commandMetadata = $commandRun->onTap->innertubeCommand->commandMetadata->webCommandMetadata ?? null;
if (!isset($commandMetadata)) {
continue;
}
$enhancement = null;
/*
$commandRun->startIndex can be offset by few positions in the positive direction
when some multibyte characters (e.g. emojis, but maybe also others) are used in the plain text video description.
(probably some difference between php and javascript in handling multibyte characters)
This loop should correct the position in most cases. It searches for the next word (determined by a set of boundary chars) with the expected length.
Several safeguards ensure that the correct word is chosen. When a link can not be matched,
everything will be discarded to prevent corrupting the description.
Hashtags require a different set of boundary chars.
*/
$isHashtag = $commandMetadata->webPageType === 'WEB_PAGE_TYPE_BROWSE';
$prevEnhancement = end($enhancements);
$minPosition = $prevEnhancement === false ? 0 : $prevEnhancement['pos'] + $prevEnhancement['len'];
$maxPosition = $descriptionContentLength - $maxPositionByStartIndex[$commandRun->startIndex];
$position = min($commandRun->startIndex - $minPositionOffset, $maxPosition);
while ($position >= $minPosition) {
// The link display value can only ever include a new line at the end (which will be removed further below), never in between.
$newLinePosition = mb_strpos($descriptionContent, "\n", $position);
if ($newLinePosition !== false && $newLinePosition < $position + ($commandRun->length - 1)) {
$position = $newLinePosition - ($commandRun->length - 1);
continue;
}
$firstChar = mb_substr($descriptionContent, $position, 1);
$boundaryStart = mb_substr($descriptionContent, $position - 1, 1);
$boundaryEndIndex = $position + $commandRun->length;
$boundaryEnd = mb_substr($descriptionContent, $boundaryEndIndex, 1);
$boundaryStartIsValid = $position === 0 ||
in_array($boundaryStart, $boundaryStartChars) ||
($isHashtag && $firstChar === '#');
$boundaryEndIsValid = $boundaryEndIndex === $descriptionContentLength ||
in_array($boundaryEnd, $isHashtag ? $hashtagBoundaryEndChars : $boundaryEndChars);
if ($boundaryStartIsValid && $boundaryEndIsValid) {
$minPositionOffset = $commandRun->startIndex - $position;
$enhancement = [
'pos' => $position,
'len' => $commandRun->length,
];
break;
}
$position--;
}
if (!isset($enhancement)) {
$this->logger->debug(sprintf('Position %d cannot be corrected in "%s"', $commandRun->startIndex, substr($descriptionContent, 0, 50) . '...'));
// Skip to prevent the description from becoming corrupted
continue;
}
// $commandRun->length sometimes incorrectly includes the newline as last char
$lastChar = mb_substr($descriptionContent, $enhancement['pos'] + $enhancement['len'] - 1, 1);
if ($lastChar === "\n") {
$enhancement['len'] -= 1;
}
$commandUrl = parse_url($commandMetadata->url);
if ($commandUrl['path'] === '/redirect') {
parse_str($commandUrl['query'], $commandUrlQuery);
$enhancement['url'] = urldecode($commandUrlQuery['q']);
} else if (isset($commandUrl['host'])) {
$enhancement['url'] = $commandMetadata->url;
} else {
$enhancement['url'] = $baseUrl . $commandMetadata->url;
}
$enhancements[] = $enhancement;
}
if (count($enhancements) !== count($commandRuns)) {
// At least one link can not be matched. Discard everything to prevent corrupting the description.
return [];
}
// Sort by position in descending order to be able to safely replace values
return array_reverse($enhancements);
}
private function ytBridgeAddItem($vid, $title, $author, $desc, $time, $thumbnail = '')
@ -277,12 +392,12 @@ class YoutubeBridge extends BridgeAbstract
$item['title'] = $title;
$item['author'] = $author;
$item['timestamp'] = $time;
$item['uri'] = self::URI . 'watch?v=' . $vid;
$item['uri'] = self::URI . '/watch?v=' . $vid;
if (!$thumbnail) {
// Fallback to default thumbnail if there aren't any provided.
$thumbnail = '0';
}
$thumbnailUri = str_replace('/www.', '/img.', self::URI) . 'vi/' . $vid . '/' . $thumbnail . '.jpg';
$thumbnailUri = str_replace('/www.', '/img.', self::URI) . '/vi/' . $vid . '/' . $thumbnail . '.jpg';
$item['content'] = '<a href="' . $item['uri'] . '"><img src="' . $thumbnailUri . '" /></a><br />' . $desc;
$this->items[] = $item;
}
@ -398,11 +513,6 @@ class YoutubeBridge extends BridgeAbstract
$vid = $wrapper->videoId;
$title = $wrapper->title->runs[0]->text;
if (isset($wrapper->ownerText)) {
$this->channel_name = $wrapper->ownerText->runs[0]->text;
} elseif (isset($wrapper->shortBylineText)) {
$this->channel_name = $wrapper->shortBylineText->runs[0]->text;
}
$author = '';
$desc = '';
@ -450,7 +560,7 @@ class YoutubeBridge extends BridgeAbstract
public function getURI()
{
if (!is_null($this->getInput('p'))) {
return static::URI . 'playlist?list=' . $this->getInput('p');
return static::URI . '/playlist?list=' . $this->getInput('p');
} elseif ($this->feeduri) {
return $this->feeduri;
}
@ -474,10 +584,10 @@ class YoutubeBridge extends BridgeAbstract
public function getIcon()
{
if (empty($this->iconURL)) {
if (empty($this->feedIconUrl)) {
return parent::getIcon();
} else {
return $this->iconURL;
return $this->feedIconUrl;
}
}
}