array(
            'url' => array(
                'name' => 'Thread URL',
                'type' => 'text',
                'required' => true,
                'title' => 'Insert URL to the thread for which the feed should be generated',
                'exampleValue' => 'https://xenforo.com/community/threads/guide-to-suggestions.2285/'
            )
        ),
        'global' => array(
            'limit' => array(
                'name' => 'Limit',
                'type' => 'number',
                'required' => false,
                'title' => 'Specify maximum number of elements to return in the feed',
                'defaultValue' => 10
            )
        )
    );

    const CACHE_TIMEOUT = 7200; // 2 hours (value is in seconds)

    private $title = '';
    private $threadurl = '';
    private $version; // Holds the XenForo version

    /**
     * Returns "<thread title> - <bridge name>" in thread context, otherwise
     * whatever the parent class reports.
     */
    public function getName()
    {
        if ($this->queriedContext === self::CONTEXT_THREAD) {
            return $this->title . ' - ' . static::NAME;
        }

        return parent::getName();
    }

    /**
     * Returns the (normalized) thread URL in thread context, otherwise
     * whatever the parent class reports.
     */
    public function getURI()
    {
        if ($this->queriedContext === self::CONTEXT_THREAD) {
            return $this->threadurl;
        }

        return parent::getURI();
    }

    /**
     * Validates the thread URL, detects the XenForo version from the DOM,
     * collects the posts and finally trims the result to the "limit" input.
     */
    public function collectData()
    {
        $this->threadurl = filter_var(
            $this->getInput('url'),
            FILTER_VALIDATE_URL,
            FILTER_FLAG_PATH_REQUIRED
        );

        if ($this->threadurl === false) {
            returnClientError('The URL you provided is invalid!');
        }

        // Scheme must be "http" or "https". The pattern is anchored so that
        // look-alike schemes (e.g. "xhttp") are rejected.
        if (preg_match('/^https?$/i', (string)parse_url($this->threadurl, PHP_URL_SCHEME)) !== 1) {
            returnClientError('The URL you provided doesn\'t specify a valid scheme (http or https)!');
        }

        // Path cannot be root (../)
        if (parse_url($this->threadurl, PHP_URL_PATH) === '/') {
            returnClientError('The URL you provided doesn\'t link to a valid thread (root path)!');
        }

        // XenForo adds a thread ID to the URL, like "...-thread.454934283". It must be present
        if (preg_match('/.+\.\d+[\/]{0,1}/', (string)parse_url($this->threadurl, PHP_URL_PATH)) !== 1) {
            returnClientError('The URL you provided doesn\'t link to a valid thread (ID missing)!');
        }

        // We want to start at the first page in the thread. XenForo uses "../page-n" syntax
        // to identify pages (except for the first page).
        // Notice: XenForo uses the concept of "sentinels" to find and replace parts in the
        // URL. Technically forum hosts can change the syntax!
        if (preg_match('/.+\/(page-\d+.*)$/', $this->threadurl, $matches) === 1) {
            // before: https://xenforo.com/community/threads/guide-to-suggestions.2285/page-5
            // after : https://xenforo.com/community/threads/guide-to-suggestions.2285/
            $this->threadurl = str_replace($matches[1], '', $this->threadurl);
        }

        $html = getSimpleHTMLDOMCached($this->threadurl)
            or returnServerError('Error loading contents from ' . $this->threadurl . '!');

        $html = defaultLinkTo($html, $this->threadurl);

        // Notice: The DOM structure changes depending on the XenForo version used
        if ($mainContent = $html->find('div.mainContent', 0)) {
            $this->version = self::XENFORO_VERSION_1;
        } elseif ($mainContent = $html->find('div[class~="p-body"]', 0)) {
            $this->version = self::XENFORO_VERSION_2;
        } else {
            returnServerError('This forum is currently not supported!');
        }

        if ($this->version === self::XENFORO_VERSION_1) {
            $titleBar = $mainContent->find('div.titleBar > h1', 0)
                or returnServerError('Error finding title bar!');

            $this->title = $titleBar->plaintext;

            // Posts of the current (first) page; later pages are merged in by extractPagesV1
            $this->extractThreadPostsV1($html, $this->threadurl);
            $this->extractPagesV1($html);
        } else {
            $titleBar = $mainContent->find('div[class~="p-title"] h1', 0)
                or returnServerError('Error finding title bar!');

            $this->title = $titleBar->plaintext;

            $this->extractThreadPostsV2($html, $this->threadurl);
            $this->extractPagesV2($html);
        }

        // Items are ordered oldest to newest, so drop surplus items from the
        // front. array_slice (instead of an array_shift loop) also terminates
        // for a negative limit.
        $excess = count($this->items) - (int)$this->getInput('limit');
        if ($excess > 0) {
            $this->items = array_slice($this->items, $excess);
        }
    }

    /**
     * Extracts thread posts (XenForo 1 markup) and appends them to $this->items
     *
     * @param $html A simplehtmldom object
     * @param $url The url from which $html was loaded
     */
    private function extractThreadPostsV1($html, $url)
    {
        $root = $html->find('html', 0);
        $lang = $root ? $root->lang : null;

        // Posts are contained in an "ol"
        $messageList = $html->find('#messageList > li')
            or returnServerError('Error finding message list!');

        foreach ($messageList as $post) {
            if (!isset($post->attr['id'])) { // Skip ads
                continue;
            }

            $content = $post->find('.messageContent > article', 0);
            if (!$content) { // Nothing to publish for this entry
                continue;
            }

            $item = array();

            $item['uri'] = $url . '#' . $post->getAttribute('id');

            // Add some style to quotes
            foreach ($content->find('.bbCodeQuote') as $quote) {
                $quote->style = ' color: #495566; background-color: rgb(248,251,253);'
                    . ' border: 1px solid rgb(111, 140, 180); border-color: rgb(111, 140, 180);'
                    . ' font-style: italic;';
            }

            // Remove script tags
            foreach ($content->find('script') as $script) {
                $script->outertext = '';
            }

            $item['content'] = $content->innertext;

            // Remove quotes (for the title). Must happen after the content
            // has been stored, because it modifies the same DOM node.
            foreach ($content->find('.bbCodeQuote') as $quote) {
                $quote->innertext = '';
            }

            $item['title'] = $this->buildTitle($content->plaintext);

            /**
             * Timestamps are looked up in two places:
             *
             * 1) abbr.DateTime: carries the timestamp in its "data-time"
             *    attribute (preferred)
             *
             * 2) span.DateTime: only carries a human readable date in its
             *    "title" attribute, which has to be interpreted depending
             *    on the page language
             */
            if ($timestamp = $post->find('abbr.DateTime', 0)) {
                $item['timestamp'] = $timestamp->{'data-time'};
            } elseif ($timestamp = $post->find('span.DateTime', 0)) {
                $parsed = $this->fixDate($timestamp->title, $lang);
                if ($parsed !== null) {
                    $item['timestamp'] = $parsed;
                }
            }

            $item['author'] = $post->getAttribute('data-author');

            // Bridge specific properties
            $item['id'] = $post->getAttribute('id');

            $this->items[] = $item;
        }
    }

    /**
     * Extracts thread posts (XenForo 2 markup) and appends them to $this->items
     *
     * @param $html A simplehtmldom object
     * @param $url The url from which $html was loaded
     */
    private function extractThreadPostsV2($html, $url)
    {
        $root = $html->find('html', 0);
        $lang = $root ? $root->lang : null;

        $messageList = $html->find('div[class~="block-body"] article')
            or returnServerError('Error finding message list!');

        foreach ($messageList as $post) {
            if (!isset($post->attr['id'])) { // Skip ads
                continue;
            }

            $content = $post->find('div[class~="message-content"] article', 0);
            if (!$content) { // Nothing to publish for this entry
                continue;
            }

            $item = array();

            $item['uri'] = $url . '#' . $post->getAttribute('id');

            $item['title'] = $this->buildTitle($content->plaintext);

            // Prefer the machine readable "datetime" attribute; otherwise
            // interpret the human readable "title" attribute.
            if ($time = $post->find('time[datetime]', 0)) {
                $item['timestamp'] = $time->datetime;
            } elseif ($time = $post->find('time', 0)) {
                $parsed = $this->fixDate($time->title, $lang);
                if ($parsed !== null) {
                    $item['timestamp'] = $parsed;
                }
            }

            $item['author'] = $post->getAttribute('data-author');
            $item['content'] = $content;

            // Bridge specific properties
            $item['id'] = $post->getAttribute('id');

            $this->items[] = $item;
        }
    }

    /**
     * Builds an item title from the plain text of a post: the text is trimmed
     * and, if longer than 70 bytes, cut at the first space at or after byte 70
     * with "..." appended.
     *
     * @param $text Plain text of the post
     * @return string The title
     */
    private function buildTitle($text)
    {
        $maxLength = 70;
        $text = trim((string)$text);

        // Checking the length first keeps the strpos offset inside the string
        // (an out-of-range offset warns on PHP 7 and throws on PHP 8).
        if (strlen($text) <= $maxLength) {
            return $text;
        }

        $end = strpos($text, ' ', $maxLength);

        if ($end === false) {
            // No word boundary left: hard cut, without splitting a multi-byte
            // character where mbstring is available.
            $head = function_exists('mb_strcut')
                ? mb_strcut($text, 0, $maxLength, 'UTF-8')
                : substr($text, 0, $maxLength);

            return $head . '...';
        }

        return substr($text, 0, $end) . '...';
    }

    /**
     * Loads further pages of a XenForo 1 thread, if a page navigation exists.
     *
     * @param $html A simplehtmldom object of the first page
     */
    private function extractPagesV1($html)
    {
        // A navigation bar becomes available if the number of posts grows too
        // high. When this happens we need to load further pages (from last backwards)
        $pageNav = $html->find('div.PageNav', 0);
        if (!$pageNav) {
            return;
        }

        $hosturl = parse_url($this->threadurl, PHP_URL_SCHEME)
            . '://'
            . parse_url($this->threadurl, PHP_URL_HOST)
            . '/';

        $this->collectFromLastPages(
            self::XENFORO_VERSION_1,
            (int)$pageNav->{'data-last'},
            (string)$pageNav->{'data-baseurl'},
            (string)$pageNav->{'data-sentinel'},
            $hosturl
        );
    }

    /**
     * Loads further pages of a XenForo 2 thread, if a page navigation exists.
     *
     * @param $html A simplehtmldom object of the first page
     */
    private function extractPagesV2($html)
    {
        // A navigation bar becomes available if the number of posts grows too
        // high. When this happens we need to load further pages (from last backwards)
        $pageNav = $html->find('div.pageNav', 0);
        if (!$pageNav) {
            return;
        }

        // The text of the last list entry is taken as the number of the last page
        $lastEntry = $pageNav->find('li', -1);
        $lastLink = $pageNav->find('li > a', -1);
        if (!$lastEntry || !$lastLink) {
            return;
        }

        $lastpage = trim($lastEntry->plaintext);
        if (!ctype_digit($lastpage)) {
            return; // Not a page number, so pages cannot be addressed
        }

        // Manually extract baseurl and inject sentinel
        $sentinel = '{{sentinel}}';
        $baseurl = str_replace('page-' . $lastpage, 'page-' . $sentinel, $lastLink->href);

        $hosturl = parse_url($this->threadurl, PHP_URL_SCHEME)
            . '://'
            . parse_url($this->threadurl, PHP_URL_HOST);

        $this->collectFromLastPages(
            self::XENFORO_VERSION_2,
            (int)$lastpage,
            $baseurl,
            $sentinel,
            $hosturl
        );
    }

    /**
     * Walks the thread from the last page backwards until enough posts for
     * the "limit" input have been collected or page 2 has been processed.
     *
     * On entry $this->items holds the posts of the first page. Those are the
     * oldest posts of the thread, so they are set aside and only put back (in
     * front) if the later pages do not provide enough posts. On return
     * $this->items is ordered by page, from the earliest loaded page to the
     * last page.
     *
     * @param $version One of the XENFORO_VERSION_* constants (selects the extractor)
     * @param $lastpage Number of the last page
     * @param $baseurl Page URL containing $sentinel in place of the page number
     * @param $sentinel Placeholder for the page number in $baseurl
     * @param $hosturl Base URL passed to defaultLinkTo for every loaded page
     */
    private function collectFromLastPages($version, $lastpage, $baseurl, $sentinel, $hosturl)
    {
        // The first page is already loaded; without a usable sentinel every
        // page would resolve to the same URL.
        if ($lastpage < 2 || $sentinel === '' || strpos($baseurl, $sentinel) === false) {
            return;
        }

        $limit = (int)$this->getInput('limit');

        $firstPageItems = $this->items;
        $this->items = array();

        $page = $lastpage;

        // Load at least the last page
        do {
            $pageurl = str_replace($sentinel, $page, $baseurl);

            // We can optimize performance by caching all but the last page
            if ($page !== $lastpage) {
                $html = getSimpleHTMLDOMCached($pageurl)
                    or returnServerError('Error loading contents from ' . $pageurl . '!');
            } else {
                $html = getSimpleHTMLDOM($pageurl)
                    or returnServerError('Error loading contents from ' . $pageurl . '!');
            }

            $html = defaultLinkTo($html, $hosturl);

            // The extractors append to $this->items, so collect this page
            // separately and put it in front of the (newer) pages loaded so far.
            $newerItems = $this->items;
            $this->items = array();

            if ($version === self::XENFORO_VERSION_1) {
                $this->extractThreadPostsV1($html, $pageurl);
            } else {
                $this->extractThreadPostsV2($html, $pageurl);
            }

            $this->items = array_merge($this->items, $newerItems);

            $page--;
        } while (count($this->items) < $limit && $page > 1);

        // Still not enough posts: fall back to the posts of the first page
        if (count($this->items) < $limit) {
            $this->items = array_merge($firstPageItems, $this->items);
        }
    }

    /**
     * Converts a human readable date into a Unix timestamp, depending on the
     * language of the page:
     *
     * en-US : Jun 9, 2018 at 11:46 PM
     * de-DE : 19 Juli 2018 um 19:27 Uhr
     *
     * Basically strtotime doesn't convert these dates correctly due to formats
     * being hard to interpret. So we use date_create_from_format and only
     * fall back to strtotime for other languages or if parsing fails.
     *
     * No timezone is passed to the parser.
     *
     * @param $date The date string
     * @param $lang Language of the page (value of the "lang" attribute)
     * @return string|null Unix timestamp, or null if the date cannot be parsed
     */
    private function fixDate($date, $lang = 'en-US')
    {
        $date = (string)$date;
        $df = false;

        switch ($lang) {
            case 'en-US': // example: Jun 9, 2018 at 11:46 PM
                $df = date_create_from_format('M d, Y \a\t H:i A', $date);
                break;
            case 'de-DE': // example: 19 Juli 2018 um 19:27 Uhr
                $mnamesen = array(
                    'January',
                    'February',
                    'March',
                    'April',
                    'May',
                    'June',
                    'July',
                    'August',
                    'September',
                    'October',
                    'November',
                    'December'
                );

                $mnamesde = array(
                    'Januar',
                    'Februar',
                    'März',
                    'April',
                    'Mai',
                    'Juni',
                    'Juli',
                    'August',
                    'September',
                    'Oktober',
                    'November',
                    'Dezember'
                );

                $mnamesdeshort = array(
                    'Jan.',
                    'Feb.',
                    'Mär.',
                    'Apr.',
                    'Mai',
                    'Juni',
                    'Juli',
                    'Aug.',
                    'Sep.',
                    'Okt.',
                    'Nov.',
                    'Dez.'
                );

                // Translate month names to English so the parser understands them
                $date = str_ireplace($mnamesde, $mnamesen, $date);
                $date = str_ireplace($mnamesdeshort, $mnamesen, $date);

                $df = date_create_from_format('d M Y \u\m H:i \U\h\r', $date);
                break;
        }

        if ($df !== false) {
            return date_format($df, 'U');
        }

        // Unknown language or unexpected format: best effort
        $fallback = strtotime($date);

        return $fallback === false ? null : (string)$fallback;
    }
}