From b037d1b4d1f0b0f422e21125ddef00a58e185ed1 Mon Sep 17 00:00:00 2001 From: Matt DeMoss Date: Tue, 21 Nov 2023 11:00:02 -0500 Subject: [PATCH] [Threads] add bridge (#3805) * initial working Threads bridge * properly specify a default limit * phpcs formatted --- bridges/ThreadsBridge.php | 120 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 bridges/ThreadsBridge.php diff --git a/bridges/ThreadsBridge.php b/bridges/ThreadsBridge.php new file mode 100644 index 00000000..b7e5cd1a --- /dev/null +++ b/bridges/ThreadsBridge.php @@ -0,0 +1,120 @@ + [ + 'u' => [ + 'name' => 'username', + 'required' => true, + 'exampleValue' => 'zuck', + 'title' => 'Insert a user name' + ], + 'limit' => [ + 'name' => 'Limit', + 'type' => 'number', + 'required' => false, + 'title' => 'Specify number of posts to fetch', + 'defaultValue' => 5 + ] + ] + ]; + + protected $feedName = self::NAME; + public function getName() + { + return $this->feedName; + } + + public function detectParameters($url) + { + // By username + $regex = '/^(https?:\/\/)?(www\.)?threads\.net\/(@)?([^\/?\n]+)/'; + if (preg_match($regex, $url, $matches) > 0) { + $params['context'] = 'By username'; + $params['u'] = urldecode($matches[3]); + return $params; + } + return null; + } + + public function getURI() + { + return self::URI . '@' . $this->getInput('u'); + } + + // https://stackoverflow.com/a/3975706/421140 + // Found this in FlaschenpostBridge, modified to return an array and take an object. + private function recursiveFind($haystack, $needle) + { + $found = []; + $iterator = new \RecursiveArrayIterator($haystack); + $recursive = new \RecursiveIteratorIterator( + $iterator, + \RecursiveIteratorIterator::SELF_FIRST + ); + foreach ($recursive as $key => $value) { + if ($key === $needle) { + $found[] = $value; + } + } + return $found; + } + + public function collectData() + { + $html = getSimpleHTMLDOMCached($this->getURI(), static::CACHE_TIMEOUT); + Debug::log(sprintf('Fetched: %s', $this->getURI())); + $jsonBlobs = $html->find('script[type="application/json"]'); + Debug::log(sprintf('%d JSON blobs found.', count($jsonBlobs))); + $gatheredCodes = []; + $limit = $this->getInput('limit'); + foreach ($jsonBlobs as $jsonBlob) { + // The structure of the JSON document is likely to change, but we're looking for a "code" inside a "post" + foreach ($this->recursiveFind($this->recursiveFind(json_decode($jsonBlob->innertext), 'post'), 'code') as $candidateCode) { + // code should be like CzZk4-USq1O or Cy3m1VnRiwP or Cywjyrdv9T6 or CzZk4-USq1O + if (grapheme_strlen($candidateCode) == 11 and !in_array($candidateCode, $gatheredCodes)) { + $gatheredCodes[] = $candidateCode; + if (count($gatheredCodes) >= $limit) { + break 2; + } + } + } + } + Debug::log(sprintf('Candidate codes found in JSON in script tags: %s', print_r($gatheredCodes, true))); + + $this->feedName = html_entity_decode($html->find('meta[property=og:title]', 0)->content); + // todo: meta[property=og:description] could populate the feed description + + foreach ($gatheredCodes as $postCode) { + $item = []; + // post URL is like: https://www.threads.net/@zuck/post/Czrr520PZfh + $item['uri'] = $this->getURI() . '/post/' . $postCode; + $articleHtml = getSimpleHTMLDOMCached($item['uri'], 15778800); // cache time: six months + + // Relying on meta tags ought to be more reliable. + if ($articleHtml->find('meta[property=og:type]', 0)->content != 'article') { + continue; + } + $item['title'] = $articleHtml->find('meta[property=og:description]', 0)->content; + $item['content'] = $articleHtml->find('meta[property=og:description]', 0)->content; + $item['author'] = html_entity_decode($articleHtml->find('meta[property=og:title]', 0)->content); + + $imageUrl = $articleHtml->find('meta[property=og:image]', 0); + if ($imageUrl) { + $item['enclosures'][] = html_entity_decode($imageUrl->content); + } + + // todo: parse hashtags out of content for $item['categories'] + // todo: try to scrape out a timestamp for $item['timestamp'], it's not in the meta tags + + $this->items[] = $item; + } + } +}