[ElsevierBridge] fix: broken bridge (#2575)

This commit is contained in:
dag 2022-03-31 09:49:30 +02:00 committed by GitHub
parent 3e363bbc20
commit 25e9f69261
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 18 additions and 55 deletions

View File

@ -1,7 +1,7 @@
<?php <?php
class ElsevierBridge extends BridgeAbstract { class ElsevierBridge extends BridgeAbstract {
const MAINTAINER = 'Pierre Mazière'; const MAINTAINER = 'dvikan';
const NAME = 'Elsevier journals recent articles'; const NAME = 'Elsevier journals recent articles';
const URI = 'https://www.journals.elsevier.com/'; const URI = 'https://www.journals.elsevier.com/';
const CACHE_TIMEOUT = 43200; //12h const CACHE_TIMEOUT = 43200; //12h
@ -16,63 +16,26 @@ class ElsevierBridge extends BridgeAbstract {
) )
)); ));
// Extracts the list of names from an article as string
/**
 * Returns the text of the first <small> element inside an article, trimmed.
 *
 * @param mixed $article Node exposing find(); presumably a simple_html_dom element — confirm against caller.
 * @return string Trimmed plain text, or '' when find() yields a falsy value.
 */
private function extractArticleName($article){
    $authorNode = $article->find('small', 0);

    return $authorNode ? trim($authorNode->plaintext) : '';
}
// Extracts the timestamp from an article
/**
 * Extracts a Unix timestamp from the ".article-info" text of an article.
 *
 * The text format depends on the age of an article:
 *  - "Available online 29 July 2016"
 *  - "July 2016"
 *  - "May-June 2016"
 *
 * @param mixed $article Node exposing find(); presumably a simple_html_dom element — confirm against caller.
 * @return int Unix timestamp, or 0 when the element is missing, no pattern matches,
 *             or the matched text cannot be parsed by strtotime().
 */
private function extractArticleTimestamp($article){
    $info = $article->find('.article-info', 0);
    if (!$info) {
        return 0;
    }

    $timestring = trim($info->plaintext);

    if (preg_match('/\S*(\d+\s\S+\s\d{4})/ims', $timestring, $matches)) {
        // Use the whole match on purpose: the greedy \S* prefix can consume the
        // leading digits of the day, leaving group 1 with a truncated day number.
        $dateString = $matches[0];
    } elseif (preg_match('/[A-Za-z]+\-([A-Za-z]+\s\d{4})/ims', $timestring, $matches)) {
        // Month range ("May-June 2016"): only the captured "June 2016" part is a
        // date on its own, so parse group 1 rather than the whole match.
        $dateString = $matches[1];
    } elseif (preg_match('/([A-Za-z]+\s\d{4})/ims', $timestring, $matches)) {
        $dateString = $matches[0];
    } else {
        return 0;
    }

    // strtotime() returns false on failure; keep the method's "0 means unknown" contract.
    $timestamp = strtotime($dateString);

    return $timestamp === false ? 0 : $timestamp;
}
// Extracts the content from an article
/**
 * Returns the trimmed plain text of the first ".article-content" element of an article.
 *
 * @param mixed $article Node exposing find(); presumably a simple_html_dom element — confirm against caller.
 * @return string Trimmed plain text, or '' when find() yields a falsy value.
 */
private function extractArticleContent($article){
    $body = $article->find('.article-content', 0);
    if (!$body) {
        return '';
    }

    return trim($body->plaintext);
}
/**
 * Returns the URL of the 32x32 favicon used as this bridge's icon.
 *
 * @return string Absolute icon URL.
 */
public function getIcon() {
    $faviconUrl = 'https://cdn.elsevier.io/verona/includes/favicons/favicon-32x32.png';

    return $faviconUrl;
}
/**
 * Loads a journal's "recent-articles" page and appends one item per matched
 * article element to $this->items.
 *
 * Each line below shows two revisions side by side: the earlier one (based on
 * '.pod-listing' elements and the extractArticle*() helpers) on the left, and
 * the later one (based on <article> elements) on the right.
 */
public function collectData(){ public function collectData(){
// The journal slug is read from the 'j' input and placed into the URL as-is.
$uri = self::URI . $this->getInput('j') . '/recent-articles/'; // Not all journals have the /recent-articles page
$html = getSimpleHTMLDOM($uri); $url = sprintf('https://www.journals.elsevier.com/%s/recent-articles/', $this->getInput('j'));
$html = getSimpleHTMLDOM($url);
foreach($html->find('.pod-listing') as $article) { foreach($html->find('article') as $recentArticle) {
$item = array(); $item = [];
// NOTE(review): the find(..., 0) results below are dereferenced without a check;
// presumably a missing element makes find() return null and this fails — confirm.
$item['uri'] = $article->find('.pod-listing-header>a', 0)->getAttribute('href') . '?np=y'; $item['uri'] = $recentArticle->find('a', 0)->getAttribute('href');
$item['title'] = $article->find('.pod-listing-header>a', 0)->plaintext; $item['title'] = $recentArticle->find('h2', 0)->plaintext;
$item['author'] = $this->extractArticleName($article); $item['author'] = $recentArticle->find('p > span', 0)->plaintext;
$item['timestamp'] = $this->extractArticleTimestamp($article); $publicationDateString = trim($recentArticle->find('p > span', 1)->plaintext);
// NOTE(review): 'F d, Y' has no '!' prefix, so per the createFromFormat documentation
// the unparsed time-of-day fields take the current time — confirm whether a stable
// midnight timestamp is wanted.
$item['content'] = $this->extractArticleContent($article); $publicationDate = DateTimeImmutable::createFromFormat('F d, Y', $publicationDateString);
// An item whose date fails to parse is still added, just without a 'timestamp' key.
if ($publicationDate) {
$item['timestamp'] = $publicationDate->getTimestamp();
}
$this->items[] = $item; $this->items[] = $item;
} }
} }
/**
 * Provides the icon for this bridge: the 32x32 favicon hosted on the Elsevier CDN.
 *
 * @return string Absolute URL of the icon image.
 */
public function getIcon(): string {
    $iconUrl = 'https://cdn.elsevier.io/verona/includes/favicons/favicon-32x32.png';

    return $iconUrl;
}
} }