[GoogleSearch] fix: improve bridge (#2848)

Sort properly by date.
Fix php errors.
Improve the date parsing logic.
Improve the content parsing by also including those with · as separator
This commit is contained in:
Dag 2022-06-22 18:32:54 +02:00 committed by GitHub
parent e7aebb223d
commit 9d5f59e2db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 60 additions and 40 deletions

View File

@ -1,19 +1,12 @@
<?php <?php
/**
* Returns the 100 most recent links in results in past year, sorting by date (most recent first).
* Example:
* http://www.google.com/search?q=sebsauvage&num=100&complete=0&tbs=qdr:y,sbd:1
* complete=0&num=100 : get 100 results
* qdr:y : in past year
* sbd:1 : sort by date (will only work if qdr: is specified)
*/
class GoogleSearchBridge extends BridgeAbstract { class GoogleSearchBridge extends BridgeAbstract {
const MAINTAINER = 'sebsauvage'; const MAINTAINER = 'sebsauvage';
const NAME = 'Google search'; const NAME = 'Google search';
const URI = 'https://www.google.com/'; const URI = 'https://www.google.com/';
const CACHE_TIMEOUT = 1800; // 30min const CACHE_TIMEOUT = 1800; // 30min
const DESCRIPTION = 'Returns most recent results from Google search.'; const DESCRIPTION = 'Returns max 100 results from the past year.';
const PARAMETERS = array(array( const PARAMETERS = array(array(
'q' => array( 'q' => array(
@ -24,43 +17,70 @@ class GoogleSearchBridge extends BridgeAbstract {
)); ));
public function collectData(){ public function collectData(){
$header = array('Accept-language: en-US'); $dom = getSimpleHTMLDOM($this->getURI(), ['Accept-language: en-US']);
$html = getSimpleHTMLDOM($this->getURI(), $header) if (!$dom) {
or returnServerError('No results for this query.'); returnServerError('No results for this query.');
}
$result = $dom->find('div[id=res]', 0);
$emIsRes = $html->find('div[id=res]', 0); if(!$result) {
return;
}
if(!is_null($emIsRes)) { foreach ($result->find('div[class~=g]') as $element) {
foreach($emIsRes->find('div[class~=g]') as $element) { $item = [];
$item = array();
$t = $element->find('a[href]', 0)->href; $url = $element->find('a[href]', 0)->href;
$item['uri'] = htmlspecialchars_decode($t); $item['uri'] = htmlspecialchars_decode($url);
$item['title'] = $element->find('h3', 0)->plaintext; $item['title'] = $element->find('h3', 0)->plaintext;
$resultComponents = explode(' — ', $element->find('div[data-content-feature=1]', 0)->plaintext);
$item['content'] = $resultComponents[1];
if(strpos($resultComponents[0], 'day') === true) { $resultDom = $element->find('div[data-content-feature=1]', 0);
$daysago = explode(' ', $resultComponents[0])[0]; if ($resultDom) {
$item['timestamp'] = date('d M Y', strtotime('-' . $daysago . ' days')); // Split by — or ·
$resultParts = preg_split('/( — | · )/', $resultDom->plaintext);
$resultDate = trim($resultParts[0]);
$resultContent = trim($resultParts[1] ?? '');
} else { } else {
$item['timestamp'] = $resultComponents[0]; // Some search results don't have this particular dom identifier
$resultDate = null;
$resultContent = null;
}
if ($resultDate) {
try {
$createdAt = new \DateTime($resultDate);
// Set to midnight for consistent datetime
$createdAt->setTime(0, 0);
$item['timestamp'] = $createdAt->format('U');
} catch (\Exception $e) {
$item['timestamp'] = 0;
}
} else {
$item['timestamp'] = 0;
}
if ($resultContent) {
$item['content'] = $resultContent;
} }
$this->items[] = $item; $this->items[] = $item;
} }
} // Sort by descending date
usort($this->items, function($a, $b) { usort($this->items, function($a, $b) {
return $a['timestamp'] < $b['timestamp']; return $b['timestamp'] <=> $a['timestamp'];
}); });
} }
public function getURI() { public function getURI() {
if (!is_null($this->getInput('q'))) { if ($this->getInput('q')) {
return self::URI $queryParameters = [
. 'search?q=' 'q' => $this->getInput('q'),
. urlencode($this->getInput('q')) 'hl' => 'en',
. '&hl=en&num=100&complete=0&tbs=qdr:y,sbd:1'; 'num' => '100', // get 100 results
'complete' => '0',
'tbs' => 'qdr:y,sbd:1' // in past year, sort by date
];
return sprintf('https://www.google.com/search?%s', http_build_query($queryParameters));
} }
return parent::getURI(); return parent::getURI();