From 9d5f59e2db6ec631eb51b4c3e442fe59b4616ee1 Mon Sep 17 00:00:00 2001 From: Dag Date: Wed, 22 Jun 2022 18:32:54 +0200 Subject: [PATCH] [GoogleSearch] fix: improve bridge (#2848) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sort properly by date. Fix php errors. Improve the date parsing logic. Improve the content parsing by also including those with · as separator --- bridges/GoogleSearchBridge.php | 100 ++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 40 deletions(-) diff --git a/bridges/GoogleSearchBridge.php b/bridges/GoogleSearchBridge.php index 17727d6f..e5b80dae 100644 --- a/bridges/GoogleSearchBridge.php +++ b/bridges/GoogleSearchBridge.php @@ -1,19 +1,12 @@ array( @@ -24,43 +17,70 @@ class GoogleSearchBridge extends BridgeAbstract { )); public function collectData(){ - $header = array('Accept-language: en-US'); - $html = getSimpleHTMLDOM($this->getURI(), $header) - or returnServerError('No results for this query.'); - - $emIsRes = $html->find('div[id=res]', 0); - - if(!is_null($emIsRes)) { - foreach($emIsRes->find('div[class~=g]') as $element) { - $item = array(); - - $t = $element->find('a[href]', 0)->href; - $item['uri'] = htmlspecialchars_decode($t); - $item['title'] = $element->find('h3', 0)->plaintext; - $resultComponents = explode(' — ', $element->find('div[data-content-feature=1]', 0)->plaintext); - $item['content'] = $resultComponents[1]; - - if(strpos($resultComponents[0], 'day') === true) { - $daysago = explode(' ', $resultComponents[0])[0]; - $item['timestamp'] = date('d M Y', strtotime('-' . $daysago . ' days')); - } else { - $item['timestamp'] = $resultComponents[0]; - } - - $this->items[] = $item; - } + $dom = getSimpleHTMLDOM($this->getURI(), ['Accept-language: en-US']); + if (!$dom) { + returnServerError('No results for this query.'); } + $result = $dom->find('div[id=res]', 0); + + if(!$result) { + return; + } + + foreach ($result->find('div[class~=g]') as $element) { + $item = []; + + $url = $element->find('a[href]', 0)->href; + $item['uri'] = htmlspecialchars_decode($url); + $item['title'] = $element->find('h3', 0)->plaintext; + + $resultDom = $element->find('div[data-content-feature=1]', 0); + if ($resultDom) { + // Split by — or · + $resultParts = preg_split('/( — | · )/', $resultDom->plaintext); + $resultDate = trim($resultParts[0]); + $resultContent = trim($resultParts[1] ?? ''); + } else { + // Some search results don't have this particular dom identifier + $resultDate = null; + $resultContent = null; + } + + if ($resultDate) { + try { + $createdAt = new \DateTime($resultDate); + // Set to midnight for consistent datetime + $createdAt->setTime(0, 0); + $item['timestamp'] = $createdAt->format('U'); + } catch (\Exception $e) { + $item['timestamp'] = 0; + } + } else { + $item['timestamp'] = 0; + } + + if ($resultContent) { + $item['content'] = $resultContent; + } + + $this->items[] = $item; + } + // Sort by descending date usort($this->items, function($a, $b) { - return $a['timestamp'] < $b['timestamp']; + return $b['timestamp'] <=> $a['timestamp']; }); } public function getURI() { - if (!is_null($this->getInput('q'))) { - return self::URI - . 'search?q=' - . urlencode($this->getInput('q')) - . '&hl=en&num=100&complete=0&tbs=qdr:y,sbd:1'; + if ($this->getInput('q')) { + $queryParameters = [ + 'q' => $this->getInput('q'), + 'hl' => 'en', + 'num' => '100', // get 100 results + 'complete' => '0', + 'tbs' => 'qdr:y,sbd:1' // in past year, sort by date + ]; + return sprintf('https://www.google.com/search?%s', http_build_query($queryParameters)); } return parent::getURI();