From 4857cdbedc4639403d2cfc731a3a191908dbee98 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Tue, 4 Feb 2014 17:53:59 +0100 Subject: [PATCH 01/18] ajout des scriptsd e Superbaillot --- bridges/Dilbert.php | 43 ++++++++++++++++++++++++++++ bridges/LesJoiesDuCode.php | 57 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 bridges/Dilbert.php create mode 100644 bridges/LesJoiesDuCode.php diff --git a/bridges/Dilbert.php b/bridges/Dilbert.php new file mode 100644 index 00000000..893747b6 --- /dev/null +++ b/bridges/Dilbert.php @@ -0,0 +1,43 @@ +returnError('Could not request Dilbert.', 404); + + foreach($html->find('div.STR_Image') as $element) { + $item = new Item(); + $href = $element->find('a',0)->href; + $item->uri = 'http://dilbert.com' . $href; + $content = str_replace('src="/', 'src="http://dilbert.com/',$element->innertext); + $content = str_replace('href="/', 'href="http://dilbert.com/',$content); + $item->content = $content; + $time = strtotime(substr($href, (strrpos($href, "/", -10) + 1), 10)); + $item->title = date("d/m/Y", $time); + $item->timestamp = $time; + $this->items[] = $item; + } + } + + public function getName(){ + return 'Dilbert'; + } + + public function getURI(){ + return 'http://dilbert.com'; + } + + public function getDescription(){ + return 'Dilbert via rss-bridge'; + } + + public function getCacheDuration(){ + return 14400; // 4 hours + } +} +?> diff --git a/bridges/LesJoiesDuCode.php b/bridges/LesJoiesDuCode.php new file mode 100644 index 00000000..a14d9f6d --- /dev/null +++ b/bridges/LesJoiesDuCode.php @@ -0,0 +1,57 @@ +returnError('Could not request LesJoiesDuCode.', 404); + + foreach($html->find('div.post') as $element) { + $item = new Item(); + $temp = $element->find('h3 a', 0); + + $titre = $temp->innertext; + $url = $temp->href; + + $temp = $element->find('div.bodytype', 0); + $content = $temp->innertext; + + $auteur = $temp->find('.c1 em', 0); + $pos = strpos($auteur->innertext, "by"); 
+ + if($pos > 0) + { + $auteur = trim(str_replace("*/", "", substr($auteur->innertext, ($pos + 2)))); + $item->name = $auteur; + } + + + $item->content .= trim($content); + $item->uri = $url; + $item->title = trim($titre); + + $this->items[] = $item; + } + } + + public function getName(){ + return 'Les Joies Du Code'; + } + + public function getURI(){ + return 'http://lesjoiesducode.fr/'; + } + + public function getCacheDuration(){ + return 7200; // 2h hours + } + public function getDescription(){ + return "Les Joies Du Code via rss-bridge"; + } +} +?> + From 13232266778c61e52aab3b37ee7874bfa574f5d2 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Tue, 4 Feb 2014 17:54:18 +0100 Subject: [PATCH 02/18] ajout du script pour Sexactu --- bridges/Sexactu.php | 58 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 bridges/Sexactu.php diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php new file mode 100644 index 00000000..5abb0325 --- /dev/null +++ b/bridges/Sexactu.php @@ -0,0 +1,58 @@ +returnError('Could not request http://www.gqmagazine.fr/sexactu.', 404); + + foreach($html->find('div.content-holder ul li') as $element) { + $item = new Item(); + $temp = $element->find('h3 a', 0); + + $titreElement = $element->find('.title-holder .article-title a'); + $titre = $titreElement-> + $url = $temp->href; + + $temp = $element->find('div.text-container', 0); + $content = $temp->innertext; + + $auteur = $temp->find('.c1 em', 0); + $pos = strpos($auteur->innertext, "by"); + + if($pos > 0) + { + $auteur = trim(str_replace("*/", "", substr($auteur->innertext, ($pos + 2)))); + $item->name = $auteur; + } + + + $item->content .= trim($content); + $item->uri = $url; + $item->title = trim($titre); + + $this->items[] = $item; + } + } + + public function getName(){ + return 'Sexactu'; + } + + public function getURI(){ + return 'http://http://www.gqmagazine.fr/sexactu/'; + } + + public function getCacheDuration(){ + return 7200; // 2h 
hours + } + public function getDescription(){ + return "Sexactu via rss-bridge"; + } +} +?> + From 4f1d4137d4221d12b3a59987bc6adb7e47b97228 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Tue, 4 Feb 2014 18:00:11 +0100 Subject: [PATCH 03/18] premire modification qui ne marche pas --- bridges/Sexactu.php | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php index 5abb0325..37b16c18 100644 --- a/bridges/Sexactu.php +++ b/bridges/Sexactu.php @@ -12,17 +12,16 @@ class LesJoiesDuCodeBridge extends BridgeAbstract{ foreach($html->find('div.content-holder ul li') as $element) { $item = new Item(); - $temp = $element->find('h3 a', 0); $titreElement = $element->find('.title-holder .article-title a'); - $titre = $titreElement-> - $url = $temp->href; + $titre = $titreElement->innertext + $url = $titreElement->href; $temp = $element->find('div.text-container', 0); $content = $temp->innertext; - $auteur = $temp->find('.c1 em', 0); - $pos = strpos($auteur->innertext, "by"); + $auteur = $element->find('div.header-holder', 0); + $pos = strpos($auteur->innertext, "par"); if($pos > 0) { From dbe9ae44dfdf307b44c65b554c98c5f40774ff77 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Wed, 5 Feb 2014 10:16:45 +0100 Subject: [PATCH 04/18] Correctiond e quelques boulettes, mais ca ne marche toujours pas --- bridges/Sexactu.php | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php index 37b16c18..9153afc7 100644 --- a/bridges/Sexactu.php +++ b/bridges/Sexactu.php @@ -5,7 +5,7 @@ * @description Sexactu via rss-bridge * @update 04/02/2014 */ -class LesJoiesDuCodeBridge extends BridgeAbstract{ +class SexactuBridge extends BridgeAbstract{ public function collectData(array $param){ $html = file_get_html('http://http://www.gqmagazine.fr/sexactu') or $this->returnError('Could not request http://www.gqmagazine.fr/sexactu.', 404); @@ -13,27 
+13,14 @@ class LesJoiesDuCodeBridge extends BridgeAbstract{ foreach($html->find('div.content-holder ul li') as $element) { $item = new Item(); - $titreElement = $element->find('.title-holder .article-title a'); - $titre = $titreElement->innertext - $url = $titreElement->href; - - $temp = $element->find('div.text-container', 0); - $content = $temp->innertext; - - $auteur = $element->find('div.header-holder', 0); - $pos = strpos($auteur->innertext, "par"); - - if($pos > 0) - { - $auteur = trim(str_replace("*/", "", substr($auteur->innertext, ($pos + 2)))); - $item->name = $auteur; - } - - - $item->content .= trim($content); - $item->uri = $url; - $item->title = trim($titre); + // various metadata + $titleBock = $element->find('title-holder'); + $titleData = $titleBlock->find('article-title h2 a'); + $item->title = trim($titleData->innertext); + $item->uri = $titleData->href; + $item->name = "Maïa Mazaurette"; + $item->content = $element->find('text-container')->innertext; $this->items[] = $item; } } From 1644a855ee07daacdc386d10e50ce3e457d0a734 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Tue, 18 Feb 2014 11:55:47 +0100 Subject: [PATCH 05/18] Finally, a working Seactu bridge (I ahve however a bug with trailing whitespaces --- bridges/Sexactu.php | 53 ++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php index 9153afc7..a44df2b8 100644 --- a/bridges/Sexactu.php +++ b/bridges/Sexactu.php @@ -5,23 +5,40 @@ * @description Sexactu via rss-bridge * @update 04/02/2014 */ -class SexactuBridge extends BridgeAbstract{ +define("GQ", "http://www.gqmagazine.fr"); +class Sexactu extends BridgeAbstract{ public function collectData(array $param){ - $html = file_get_html('http://http://www.gqmagazine.fr/sexactu') or $this->returnError('Could not request http://www.gqmagazine.fr/sexactu.', 404); - - foreach($html->find('div.content-holder ul li') as $element) { - $item = new 
Item(); - - // various metadata - $titleBock = $element->find('title-holder'); - $titleData = $titleBlock->find('article-title h2 a'); - - $item->title = trim($titleData->innertext); - $item->uri = $titleData->href; - $item->name = "Maïa Mazaurette"; - $item->content = $element->find('text-container')->innertext; - $this->items[] = $item; + $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + + foreach($html->find('.content-holder') as $contentHolder) { + // only use first list as second one only contains pages numbers + $articles = $contentHolder->find('ul', 0); + foreach($articles->find('li') as $element) { + // if you ask about that method_exists, there seems to be a bug in simple html dom + // see stackoverflow for more details : http://stackoverflow.com/a/10828479/15619 + if(is_object($element)) { + $item = new Item(); + // various metadata + $titleBlock = $element->find('.title-holder', 0); + if(is_object($titleBlock)) { + $titleData = $titleBlock->find('.article-title',0)->find('h2', 0)->find('a',0); + $item->title = trim($titleData->innertext); + $item->uri = GQ.$titleData->href; + + $item->name = "Maïa Mazaurette"; + $elementText = $element->find('.text-container', 0); + // don't forget to replace images server url with gq one + foreach($elementText->find('img') as $image) { + $image->src = GQ.$image->src; + } + $item->content = $elementText->innertext; + $this->items[] = $item; + } + + } + + } } } @@ -30,7 +47,7 @@ class SexactuBridge extends BridgeAbstract{ } public function getURI(){ - return 'http://http://www.gqmagazine.fr/sexactu/'; + return GQ.'/sexactu'; } public function getCacheDuration(){ @@ -40,5 +57,7 @@ class SexactuBridge extends BridgeAbstract{ return "Sexactu via rss-bridge"; } } -?> + +// what did you do Seb ? WHAT DID YOU DO ???? 
+// seems like bridge should not incldue php close ?> From 5f150d3ae53c9c9597c19a9181915f59eb06226e Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Tue, 18 Feb 2014 11:58:29 +0100 Subject: [PATCH 06/18] Outputting path to expected bridge should help newcomers feeling welcomed, no ? --- lib/Bridge.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Bridge.php b/lib/Bridge.php index 7b6c4751..dbff16b2 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -92,9 +92,9 @@ class Bridge{ } $pathBridge = self::getDir() . $nameBridge . '.php'; - + if( !file_exists($pathBridge) ){ - throw new \Exception('The bridge you looking for does not exist.'); + throw new \Exception('The bridge you looking for does not exist. It should be at path '.$pathBridge); } require_once $pathBridge; From f7976419ae3924d16c1eaf899a86df553c964ed6 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Thu, 20 Feb 2014 08:42:40 +0100 Subject: [PATCH 07/18] fixed title formatting --- bridges/Sexactu.php | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php index a44df2b8..20ba5f94 100644 --- a/bridges/Sexactu.php +++ b/bridges/Sexactu.php @@ -9,7 +9,10 @@ define("GQ", "http://www.gqmagazine.fr"); class Sexactu extends BridgeAbstract{ public function collectData(array $param){ - $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); +$find = array('janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet', 'août', 'septembre', 'novembre', 'décembre'); +$replace = array('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'); + + $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); foreach($html->find('.content-holder') as $contentHolder) { // only use first list as second one only contains pages numbers @@ -22,10 
+25,19 @@ class Sexactu extends BridgeAbstract{ // various metadata $titleBlock = $element->find('.title-holder', 0); if(is_object($titleBlock)) { - $titleData = $titleBlock->find('.article-title',0)->find('h2', 0)->find('a',0); - $item->title = trim($titleData->innertext); + $titleDetails = $titleBlock->find('.article-title',0); + $titleData = $titleDetails->find('h2', 0)->find('a',0); + $titleTimestamp =$titleDetails->find('h4',0); + $item->title = $this->correctCase(trim($titleData->innertext)); $item->uri = GQ.$titleData->href; + // Fugly date parsing due to the fact my DNS-323 doesn't support php intl extension + $dateText = $titleTimestamp->innertext; + $dateText = substr($dateText, strpos($dateText,',')+1); + $dateText = str_replace($find, $replace, strtolower($dateText)); + $date = strtotime($dateText); + $item->timestamp = $date; + $item->name = "Maïa Mazaurette"; $elementText = $element->find('.text-container', 0); // don't forget to replace images server url with gq one @@ -56,8 +68,20 @@ class Sexactu extends BridgeAbstract{ public function getDescription(){ return "Sexactu via rss-bridge"; } + + public function correctCase($str) { + $sentences=explode('.', mb_strtolower($str, "UTF-8")); + $str=""; + $sep=""; + foreach ($sentences as $sentence) + { + //upper case first char + $sentence=ucfirst(trim($sentence)); + + //append sentence to output + $str=$str.$sep.$sentence; + $sep=". "; + } + return $str; + } } - -// what did you do Seb ? WHAT DID YOU DO ???? 
-// seems like bridge should not incldue php close ?> - From 79e4e9fdea5c56ba91970cb8f0c763de8a3adaeb Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Thu, 20 Feb 2014 08:43:14 +0100 Subject: [PATCH 08/18] added a bridge for one Liberation blog that could easily be extended for others --- bridges/Les400Culs.php | 98 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 bridges/Les400Culs.php diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php new file mode 100644 index 00000000..40e1f69b --- /dev/null +++ b/bridges/Les400Culs.php @@ -0,0 +1,98 @@ +getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + + foreach($html->find('#alpha-inner') as $articles) { + foreach($articles->find('div.entry') as $article) { + $header = $article->find('h3.entry-header a', 0); + $content = $article->find('div.entry-content', 0); + + + $item = new Item(); + $item->title = trim($header->innertext); + $item->uri = $header->href; + $item->name = "Agnès Girard"; + // date is stored outside this node ! + $dateHeader = $article->prev_sibling(); + // http://stackoverflow.com/a/6239199/15619 (strtotime is typical amercian bullshit) + $item->timestamp = DateTime::createFromFormat('d/m/Y', $dateHeader->innertext)->getTimestamp(); + + + $linkForMore = $content->find('p.entry-more-link a',0); + if($linkForMore==null) { + $item->content = $content->innertext; + } else { + $pageAddress = $linkForMore->href; + $articlePage = str_get_html($this->get_cached($linkForMore->href)); + if($articlePage==null) { + $item->content = $content->innertext."\n

".$linkForMore->outertext."

"; + } else { + // TODO use some caching there ! + $fullContent = $articlePage->find('div.entry-content', 0); + $item->content = $fullContent->innertext; + } + } + $this->items[] = $item; + } + } + } + + public function getName(){ + return 'Les 400 Culs'; + } + + public function getURI(){ + return SEXE; + } + + public function getCacheDuration(){ + return 7200; // 2h hours + } + public function getDescription(){ + return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge"; + } + + /** + * Maintain locally cached versions of pages to download to avoid multiple doiwnloads. + * A file name is generated by replacing all "/" by "_", and the file is saved below this bridge cache + * @param url url to cache + * @return content of file as string + */ + public function get_cached($url) { + $simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url); + $filename = __DIR__ . '/../cache/'."pages/".$simplified_url; + if (substr($filename, -1) == '/') { + $filename = $filename."index.html"; + } + if(!file_exists($filename)) { + error_log("we have no local copy of ".$url." 
Downloading !"); + $dir = substr($filename, 0, strrpos($filename, '/')); + if(!is_dir($dir)) { + mkdir($dir, 0777, true); + } + $this->download_remote($url, $filename); + } + return file_get_contents($filename); + } + + public function download_remote($url , $save_path) { + $f = fopen( $save_path , 'w+'); + $handle = fopen($url , "rb"); + while (!feof($handle)) { + $contents = fread($handle, 8192); + fwrite($f , $contents); + } + fclose($handle); + fclose($f); + } + +} From ef0ce7d6691fc4e9d1a180df8f15d7e6b92767b1 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Thu, 20 Feb 2014 08:43:55 +0100 Subject: [PATCH 09/18] Strangely, those bridges didn't seems to work exactly as expected --- bridges/Dilbert.php | 4 ++-- bridges/LesJoiesDuCode.php | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/bridges/Dilbert.php b/bridges/Dilbert.php index 893747b6..cf741a31 100644 --- a/bridges/Dilbert.php +++ b/bridges/Dilbert.php @@ -5,7 +5,7 @@ * @description The Unofficial Dilbert Daily Comic Strip RSS Feed via rss-bridge * @update 16/10/2013 */ -class DilbertBridge extends BridgeAbstract{ +class Dilbert extends BridgeAbstract{ public function collectData(array $param){ $html = file_get_html('http://dilbert.com/strips/') or $this->returnError('Could not request Dilbert.', 404); @@ -40,4 +40,4 @@ class DilbertBridge extends BridgeAbstract{ return 14400; // 4 hours } } -?> + diff --git a/bridges/LesJoiesDuCode.php b/bridges/LesJoiesDuCode.php index a14d9f6d..c5a74b8e 100644 --- a/bridges/LesJoiesDuCode.php +++ b/bridges/LesJoiesDuCode.php @@ -5,7 +5,7 @@ * @description LesJoiesDuCode via rss-bridge * @update 30/01/2014 */ -class LesJoiesDuCodeBridge extends BridgeAbstract{ +class LesJoiesDuCode extends BridgeAbstract{ public function collectData(array $param){ $html = file_get_html('http://lesjoiesducode.fr/') or $this->returnError('Could not request LesJoiesDuCode.', 404); @@ -53,5 +53,3 @@ class LesJoiesDuCodeBridge extends BridgeAbstract{ return "Les 
Joies Du Code via rss-bridge"; } } -?> - From 62a5265433ecfdba17d71c3e14b02a1ec551875e Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Thu, 20 Feb 2014 12:00:50 +0100 Subject: [PATCH 10/18] updated description --- bridges/Les400Culs.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php index 40e1f69b..aea233bb 100644 --- a/bridges/Les400Culs.php +++ b/bridges/Les400Culs.php @@ -1,9 +1,9 @@ Date: Mon, 3 Mar 2014 14:12:24 +0100 Subject: [PATCH 11/18] Prepared Gawker bridge by extracting file cache from initial Liberation bridge --- bridges/Les400Culs.php | 37 +---------------------------- lib/Bridge.php | 54 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 36 deletions(-) diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php index aea233bb..1dd9e3e4 100644 --- a/bridges/Les400Culs.php +++ b/bridges/Les400Culs.php @@ -6,7 +6,7 @@ * @update 20/02/2014 */ define("SEXE", "http://sexes.blogs.liberation.fr"); -class Les400Culs extends BridgeAbstract{ +class Les400Culs extends HttpCachingBridgeAbstract{ public function collectData(array $param){ $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); @@ -60,39 +60,4 @@ class Les400Culs extends BridgeAbstract{ public function getDescription(){ return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge"; } - - /** - * Maintain locally cached versions of pages to download to avoid multiple doiwnloads. - * A file name is generated by replacing all "/" by "_", and the file is saved below this bridge cache - * @param url url to cache - * @return content of file as string - */ - public function get_cached($url) { - $simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url); - $filename = __DIR__ . 
'/../cache/'."pages/".$simplified_url; - if (substr($filename, -1) == '/') { - $filename = $filename."index.html"; - } - if(!file_exists($filename)) { - error_log("we have no local copy of ".$url." Downloading !"); - $dir = substr($filename, 0, strrpos($filename, '/')); - if(!is_dir($dir)) { - mkdir($dir, 0777, true); - } - $this->download_remote($url, $filename); - } - return file_get_contents($filename); - } - - public function download_remote($url , $save_path) { - $f = fopen( $save_path , 'w+'); - $handle = fopen($url , "rb"); - while (!feof($handle)) { - $contents = fread($handle, 8192); - fwrite($f , $contents); - } - fclose($handle); - fclose($f); - } - } diff --git a/lib/Bridge.php b/lib/Bridge.php index dbff16b2..3ef4a219 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -73,6 +73,60 @@ abstract class BridgeAbstract implements BridgeInterface{ } } +/** + * Extension of BridgeAbstract allowing caching of files downloaded over http files. + * This is specially useful for sites from Gawker or Liberation networks, which allow pages excerpts top be viewed together on index, while full pages have to be downloaded + * separately. + * This class mainly provides a get_cached method which will will download the file from its remote location. + * TODO allow file cache invalidation by touching files on access, and removing files/directories which have not been touched since ... a long time + * After all, rss-bridge is not respaw, isn't it ? + */ +abstract class HttpCachingBridgeAbstract extends BridgeAbstract { + + /** + * Maintain locally cached versions of pages to download to avoid multiple doiwnloads. 
+ * A file name is generated by replacing all "/" by "_", and the file is saved below this bridge cache + * @param url url to cache + * @return content of file as string + */ + public function get_cached($url) { + $simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url); + // TODO build this from the variable given to Cache + $pageCacheDir = __DIR__ . '/../cache/'."pages/"; + $filename = $pageCacheDir.$simplified_url; + if (substr($filename, -1) == '/') { + $filename = $filename."index.html"; + } + if(file_exists($filename)) { + // TODO touch file and its parent, and try to do neighbour deletion + $currentPath = $filename; + while(!$pageCacheDir==$currentPath) { + touch($currentPath); + $currentPath = dirname($currentPath); + } + } else { + error_log("we have no local copy of ".$url." Downloading !"); + $dir = substr($filename, 0, strrpos($filename, '/')); + if(!is_dir($dir)) { + mkdir($dir, 0777, true); + } + $this->download_remote($url, $filename); + } + return file_get_contents($filename); + } + + public function download_remote($url , $save_path) { + $f = fopen( $save_path , 'w+'); + $handle = fopen($url , "rb"); + while (!feof($handle)) { + $contents = fread($handle, 8192); + fwrite($f , $contents); + } + fclose($handle); + fclose($f); + } +} + class Bridge{ static protected $dirBridge; From fda3e9886e2a3dcd718128799fff7b288cdfe5c9 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Wed, 26 Mar 2014 11:16:31 +0100 Subject: [PATCH 12/18] Allowed cache disabling to be performed on a per-query basis (very useful to add bridges to an existing and running installation of rss-bridge) --- index.php | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/index.php b/index.php index a95fcbb4..101e0148 100644 --- a/index.php +++ b/index.php @@ -39,9 +39,11 @@ try{ // Data retrieval $bridge = Bridge::create($bridge); - $bridge - ->setCache($cache) // Comment this lign for avoid cache use - ->setDatas($_REQUEST); + 
if(isset($_REQUEST["disable_cache"])) { + } else { + $bridge->setCache($cache); // just add disable cache to your query to disable caching + } + $bridge->setDatas($_REQUEST); // Data transformation $format = Format::create($format); From 954bc4dde0f351eaf54f2833aa0aee9ede2ed48b Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Wed, 26 Mar 2014 11:20:24 +0100 Subject: [PATCH 13/18] Improved caching behaviour and allowed subclasses to easily use logging 'infrastructure' --- lib/Bridge.php | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/Bridge.php b/lib/Bridge.php index 3ef4a219..75f470eb 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -90,7 +90,7 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract { * @return content of file as string */ public function get_cached($url) { - $simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url); + $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url); // TODO build this from the variable given to Cache $pageCacheDir = __DIR__ . '/../cache/'."pages/"; $filename = $pageCacheDir.$simplified_url; @@ -98,6 +98,7 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract { $filename = $filename."index.html"; } if(file_exists($filename)) { + // $this->message("loading cached file from ".$filename." for page at url ".$url); // TODO touch file and its parent, and try to do neighbour deletion $currentPath = $filename; while(!$pageCacheDir==$currentPath) { @@ -105,7 +106,7 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract { $currentPath = dirname($currentPath); } } else { - error_log("we have no local copy of ".$url." Downloading !"); + // $this->message("we have no local copy of ".$url." 
Downloading !"); $dir = substr($filename, 0, strrpos($filename, '/')); if(!is_dir($dir)) { mkdir($dir, 0777, true); @@ -125,6 +126,15 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract { fclose($handle); fclose($f); } + + public function message($text) { + $backtrace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3); + $calling = $backtrace[2]; + $message = $calling["file"].":".$calling["line"] + ." class ".get_class($this)."->".$calling["function"] + ." - ".$text; + error_log($message); + } } class Bridge{ From e564559bdafbb1cfe01849d0932182a31bf6cfdd Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Wed, 26 Mar 2014 11:22:53 +0100 Subject: [PATCH 14/18] Fixes #42 by adding a working bridge, complete with section specific feeds --- bridges/Freenews.php | 69 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 bridges/Freenews.php diff --git a/bridges/Freenews.php b/bridges/Freenews.php new file mode 100644 index 00000000..dad8eb6d --- /dev/null +++ b/bridges/Freenews.php @@ -0,0 +1,69 @@ +uri = RUBRIQUE.$param['id']; + } + $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); +// $this->message("loaded HTML from ".$this->getURI()); + // customize name + $this->name = $html->find('title', 0)->innertext; + foreach($html->find('.news_line') as $newsLines) { + $this->parseLine($newsLines); + } + } + + public function parseLine($newsLines) { + foreach($newsLines->find('span') as $newsSpan) { + foreach($newsSpan->find('a') as $newsLink) { + $item = new Item(); + $item->title = trim($newsLink->title); + $item->uri = FREENEWS.$newsLink->href; + // now load that uri from cache + $articlePage = str_get_html($this->get_cached($item->uri)); + $content = $articlePage->find('.chapo', 0); + foreach($content->find('img') as $image) { + $image->src = FREENEWS.$image->src; + } + $redaction = $articlePage->find('.redac', 0); + $rubrique = $redaction->find('a', 0); + 
$auteur = $redaction->find('a', 1); + $item->content = $content->innertext; + $item->name = $auteur->innertext; + // format should parse 2014-03-25T16:21:20Z. But, according to http://stackoverflow.com/a/10478469, it is not that simple + $item->timestamp = DateTime::createFromFormat('Y-m-d\TH:i:s+', $redaction->title)->getTimestamp(); + $this->items[] = $item; + // return after first link, as there are hidden treasures in those pages + return; + } + } + } + + public function getName(){ + return $this->name; + } + + public function getURI(){ + return $this->uri; + } + + public function getCacheDuration(){ + return 3600; // 2h hours + } + public function getDescription(){ + return "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). par rss-bridge"; + } +} From 0fa32c7cf9d3ab947766c986119372f737769654 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Wed, 26 Mar 2014 16:04:10 +0100 Subject: [PATCH 15/18] added a simple method to allow all subclasses to easily relocate image links --- lib/Bridge.php | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lib/Bridge.php b/lib/Bridge.php index 75f470eb..4467da0b 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -71,6 +71,17 @@ abstract class BridgeAbstract implements BridgeInterface{ return $this; } + + /** + * Set default image SRC attribute to point on given server when none is provided (that's to say when image src starts with '/' + */ + public function defaultImageSrcTo($content, $server) { + foreach($content->find('img') as $image) { + if(strpos($image->src, '/')==0) { + $image->src = $server.$image->src; + } + } + } } /** From 1a12f48e2e872b84212a4273acc86d37073a683a Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Wed, 26 Mar 2014 16:22:31 +0100 Subject: [PATCH 16/18] Working WorldOfTanks official news bridge, complete with images and categories --- bridges/WorldOfTanks.php | 63 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 
100644 bridges/WorldOfTanks.php diff --git a/bridges/WorldOfTanks.php b/bridges/WorldOfTanks.php new file mode 100644 index 00000000..1a6e6257 --- /dev/null +++ b/bridges/WorldOfTanks.php @@ -0,0 +1,63 @@ +lang = $param['lang']; + } + if(empty($param['category'])) { + $this->uri = WORLD_OF_TANKS.$this->lang.NEWS; + } else { + $this->uri = WORLD_OF_TANKS.$this->lang.NEWS.$param['category']."/"; + } + $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + $this->message("loaded HTML from ".$this->getURI()); + // customize name + $this->name = $html->find('title', 0)->innertext; + foreach($html->find('.b-imgblock_ico') as $infoLink) { + $this->parseLine($infoLink); + } + } + + public function parseLine($infoLink) { + $item = new Item(); + $item->uri = WORLD_OF_TANKS.$infoLink->href; + // now load that uri from cache +// $this->message("loading page ".$item->uri); + $articlePage = str_get_html($this->get_cached($item->uri)); + $content = $articlePage->find('.l-content', 0); + $this->defaultImageSrcTo($content, WORLD_OF_TANKS); + $item->title = $content->find('h1', 0)->innertext; + $item->content = $content->find('.b-content', 0)->innertext; +// $item->name = $auteur->innertext; + $item->timestamp = $content->find('.b-statistic_time', 0)->getAttribute("data-timestamp"); + $this->items[] = $item; + } + + public function getName(){ + return $this->name; + } + + public function getURI(){ + return $this->uri; + } + + public function getCacheDuration(){ + return 3600; // 2h hours + } + public function getDescription(){ + return "Toutes les actualités les plus brulantes de ce simulateur de destruction d'acier."; + } +} From fd71ceae821a1e6b597a252e16c86bb682e72570 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Wed, 2 Apr 2014 10:55:42 +0100 Subject: [PATCH 17/18] Adding a meta-bridge for all Gawker publications (lifehacker, kotaku, you name it, ...) 
--- bridges/Gawker.php | 89 ++++++++++++++++++++++++++++++++++++++++++++++ lib/Bridge.php | 47 +++++++++++++++++------- 2 files changed, 123 insertions(+), 13 deletions(-) create mode 100644 bridges/Gawker.php diff --git a/bridges/Gawker.php b/bridges/Gawker.php new file mode 100644 index 00000000..4d135220 --- /dev/null +++ b/bridges/Gawker.php @@ -0,0 +1,89 @@ +uri = $param['site']; + } + $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + $this->message("loaded HTML from ".$this->getURI()); + // customize name + $this->name = $html->find('title', 0)->innertext; + foreach($html->find('.main-column') as $content) { + $this->parseContent($content); + } + } + + public function parseContent($content) { + foreach($content->find('.headline') as $headline) { + foreach($headline->find('a') as $articleLink) { + // notice we only use article from this gawker site (as gawker like to see us visit other sites) + if(strpos($articleLink->href, $this->getURI())>=0) { + $this->parseLink($articleLink); + } + } + } + } + + public function parseLink($infoLink) { + $item = new Item(); + $item->uri = $infoLink->href; + $item->title = $infoLink->innertext; + try { + // now load that uri from cache +// $this->message("loading page ".$item->uri); + $articlePage = str_get_html($this->get_cached($item->uri)); + if(is_object($articlePage)) { + $content = $articlePage->find('.post-content', 0); + $this->defaultImageSrcTo($content, $this->getURI()); + $item->content = $content->innertext; + // http://stackoverflow.com/q/22715928/15619 + $publishtime = $articlePage->find('.publish-time', 0)->getAttribute("data-publishtime"); + // don't know what I'm doing there, but http://www.epochconverter.com/programming/functions-php.php#epoch2date recommends it + $item->timestamp = $this->js_to_unix_timestamp($publishtime); + $vcard = $articlePage->find('.vcard', 0); + if(is_object($vcard)) { + $item->name = $vcard->find('a', 0)->innertext; + } + } 
else { + throw new Exception("cache content for ".$item->uri." is NOT a Simple DOM parser object !"); + } + } catch(Exception $e) { + $this->message("obtaining ".$item->uri." resulted in exception ".$e->getMessage().". Deleting cached page ..."); + // maybe file is incorrect. it should be discarded from cache + $this->remove_from_cache($item->url); + $item->content = $e->getMessage(); + } + $this->items[] = $item; + } + + function js_to_unix_timestamp($jsTimestamp){ + return $jsTimestamp/1000; + } + + public function getName(){ + return $this->name; + } + + public function getURI(){ + return $this->uri; + } + + public function getCacheDuration(){ + return 3600; // 1h + } + public function getDescription(){ + return "Gawker press blog content."; + } +} diff --git a/lib/Bridge.php b/lib/Bridge.php index 4467da0b..aeaba6f7 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -109,33 +109,54 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract { $filename = $filename."index.html"; } if(file_exists($filename)) { - // $this->message("loading cached file from ".$filename." for page at url ".$url); +// $this->message("loading cached file from ".$filename." for page at url ".$url); // TODO touch file and its parent, and try to do neighbour deletion - $currentPath = $filename; - while(!$pageCacheDir==$currentPath) { - touch($currentPath); - $currentPath = dirname($currentPath); - } + $this->refresh_in_cache($pageCacheDir, $filename); } else { - // $this->message("we have no local copy of ".$url." Downloading !"); +// $this->message("we have no local copy of ".$url." 
Downloading to ".$filename); $dir = substr($filename, 0, strrpos($filename, '/')); if(!is_dir($dir)) { +// $this->message("creating directories for ".$dir); mkdir($dir, 0777, true); } $this->download_remote($url, $filename); } return file_get_contents($filename); } + + /** + * Refresh the modification time of a cached page and of each parent directory, walking upwards until the cache root is reached. + * + * @param string $pageCacheDir root directory of the page cache + * @param string $filename cached file that has just been served + */ + private function refresh_in_cache($pageCacheDir, $filename) { + /* canonicalise both paths: the cache root may carry ".." segments or a trailing slash, which dirname() never returns */ + $cacheRoot = realpath($pageCacheDir); + $currentPath = realpath($filename); + if($cacheRoot === false || $currentPath === false) { + return; + } + /* the former test "!$pageCacheDir==$currentPath" negated the directory before comparing, so nothing was ever touched; the prefix check ends the walk as soon as the path is no longer below the cache root */ + while($currentPath !== $cacheRoot && strpos($currentPath, $cacheRoot.DIRECTORY_SEPARATOR) === 0) { + touch($currentPath); + $currentPath = dirname($currentPath); + } + } public function download_remote($url , $save_path) { $f = fopen( $save_path , 'w+'); - $handle = fopen($url , "rb"); - while (!feof($handle)) { - $contents = fread($handle, 8192); - fwrite($f , $contents); + if($f) { + $handle = fopen($url , "rb"); + if($handle) { + while (!feof($handle)) { + $contents = fread($handle, 8192); + /* strict test: a chunk containing only "0" is falsy and would otherwise be dropped */ + if($contents !== false && $contents !== '') { + fwrite($f , $contents); + } + } + fclose($handle); + } + fclose($f); } - fclose($handle); - fclose($f); + } + + public function remove_from_cache($url) { + $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url); + // TODO build this from the variable given to Cache + $pageCacheDir = __DIR__ . '/../cache/'."pages/"; + $filename = realpath($pageCacheDir.$simplified_url); + $this->message("removing from cache \"".$filename."\" WELL, NOT REALLY"); + // filename is NO GOOD +// unlink($filename); } public function message($text) { From ce28cfad221329f645f328379b88f46995b395ec Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Wed, 18 Mar 2015 17:42:55 +0100 Subject: [PATCH 18/18] Rewritten those bridge to use a new mechanism ! RSS expander will get data from the RSS feed and expand the truncated entries to their full values. 
--- bridges/Freenews.php | 82 +++++++++++++---------------------------- bridges/Gawker.php | 79 +++++++++++++-------------------------- bridges/Les400Culs.php | 73 ++++++++++++++---------------------- bridges/RssExpander.php | 68 ++++++++++++++++++++++++++++++++++ bridges/Sexactu.php | 2 +- 5 files changed, 147 insertions(+), 157 deletions(-) create mode 100644 bridges/RssExpander.php diff --git a/bridges/Freenews.php b/bridges/Freenews.php index dad8eb6d..caaf7694 100644 --- a/bridges/Freenews.php +++ b/bridges/Freenews.php @@ -1,69 +1,37 @@ uri = RUBRIQUE.$param['id']; - } - $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); -// $this->message("loaded HTML from ".$this->getURI()); - // customize name - $this->name = $html->find('title', 0)->innertext; - foreach($html->find('.news_line') as $newsLines) { - $this->parseLine($newsLines); - } + $param['url'] = RSS; + parent::collectData($param); } - public function parseLine($newsLines) { - foreach($newsLines->find('span') as $newsSpan) { - foreach($newsSpan->find('a') as $newsLink) { - $item = new Item(); - $item->title = trim($newsLink->title); - $item->uri = FREENEWS.$newsLink->href; - // now load that uri from cache - $articlePage = str_get_html($this->get_cached($item->uri)); - $content = $articlePage->find('.chapo', 0); - foreach($content->find('img') as $image) { - $image->src = FREENEWS.$image->src; - } - $redaction = $articlePage->find('.redac', 0); - $rubrique = $redaction->find('a', 0); - $auteur = $redaction->find('a', 1); - $item->content = $content->innertext; - $item->name = $auteur->innertext; - // format should parse 2014-03-25T16:21:20Z. 
But, according to http://stackoverflow.com/a/10478469, it is not that simple - $item->timestamp = DateTime::createFromFormat('Y-m-d\TH:i:s+', $redaction->title)->getTimestamp(); - $this->items[] = $item; - // return after first link, as there are hidden treasures in those pages - return; - } - } - } + protected function parseRSSItem($newsItem) { + $item = new Item(); + $item->title = trim($newsItem->title); +// $this->message("item has for title \"".$item->title."\""); + if(empty($newsItem->guid)) { + $item->uri = $newsItem->link; + } else { + $item->uri = $newsItem->guid; + } + // now load that uri from cache +// $this->message("now loading page ".$item->uri); + $articlePage = str_get_html($this->get_cached($item->uri)); - public function getName(){ - return $this->name; - } - - public function getURI(){ - return $this->uri; - } - - public function getCacheDuration(){ - return 3600; // 2h hours - } - public function getDescription(){ - return "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). par rss-bridge"; + $content = $articlePage->find('.post-container', 0); + $item->content = $content->innertext; + $item->name = $articlePage->find('a[rel=author]', 0)->innertext; + // format should parse 2014-03-25T16:21:20Z. But, according to http://stackoverflow.com/a/10478469, it is not that simple + $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem); + return $item; } } diff --git a/bridges/Gawker.php b/bridges/Gawker.php index 4d135220..f8b484ff 100644 --- a/bridges/Gawker.php +++ b/bridges/Gawker.php @@ -2,44 +2,36 @@ /** * * @name Gawker media -* @description A bridge allowing access to any of the numerous Gawker media blogs (Lifehacker, deadspin, Kotaku, Jezebel, and so on +* @description A bridge allowing access to any of the numerous Gawker media blogs (Lifehacker, deadspin, Kotaku, Jezebel, and so on. 
Notice you have to give its id to find the RSS stream in gawker maze * @update 27/03/2014 -* @use1(site="site") +* @use1(site="site id to put in uri between feeds.gawker.com and /full .. which is obviously not full AT ALL") */ -class Gawker extends HttpCachingBridgeAbstract{ - private $uri; - private $name; +require_once 'bridges/RssExpander.php'; +define("RSS_PREFIX", "http://feeds.gawker.com/"); +define("RSS_SUFFIX", "/full"); +class Gawker extends RssExpander{ + + private function toURI($name) { + return RSS_PREFIX.$name.RSS_SUFFIX; + } public function collectData(array $param){ if (empty($param['site'])) { trigger_error("If no site is provided, nothing is gonna happen", E_USER_ERROR); } else { - $this->uri = $param['site']; + $this->name = $param['site']; + $param['url'] = $this->toURI(strtolower($param['site'])); } - $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); - $this->message("loaded HTML from ".$this->getURI()); - // customize name - $this->name = $html->find('title', 0)->innertext; - foreach($html->find('.main-column') as $content) { - $this->parseContent($content); - } +// $this->message("loading feed from ".$this->getURI()); + parent::collectData($param); } - - public function parseContent($content) { - foreach($content->find('.headline') as $headline) { - foreach($headline->find('a') as $articleLink) { - // notice we only use article from this gawker site (as gawker like to see us visit other sites) - if(strpos($articleLink->href, $this->getURI())>=0) { - $this->parseLink($articleLink); - } - } - } - } - public function parseLink($infoLink) { + protected function parseRSSItem($newsItem) { $item = new Item(); - $item->uri = $infoLink->href; - $item->title = $infoLink->innertext; + $item->uri = trim($newsItem->link); + $item->title = trim($newsItem->title); + $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem); +// 
$this->message("///////////////////////////////////////////////////////////////////////////////////////\nprocessing item ".var_export($item, true)."\n\n\nbuilt from\n\n\n".var_export($newsItem, true)); try { // now load that uri from cache // $this->message("loading page ".$item->uri); @@ -47,15 +39,15 @@ class Gawker extends HttpCachingBridgeAbstract{ if(is_object($articlePage)) { $content = $articlePage->find('.post-content', 0); $this->defaultImageSrcTo($content, $this->getURI()); - $item->content = $content->innertext; - // http://stackoverflow.com/q/22715928/15619 - $publishtime = $articlePage->find('.publish-time', 0)->getAttribute("data-publishtime"); - // don't know what I'm doing there, but http://www.epochconverter.com/programming/functions-php.php#epoch2date recommends it - $item->timestamp = $this->js_to_unix_timestamp($publishtime); $vcard = $articlePage->find('.vcard', 0); if(is_object($vcard)) { - $item->name = $vcard->find('a', 0)->innertext; + $authorLink = $vcard->find('a', 0); + $item->name = $authorLink->innertext; + // TODO use author link href to fill the feed info } +// $this->message("item quite loaded : ".var_export($item, true)); + // I set item content as last element, for easier var_export reading + $item->content = $content->innertext; } else { throw new Exception("cache content for ".$item->uri." 
is NOT a Simple DOM parser object !"); } @@ -65,25 +57,6 @@ class Gawker extends HttpCachingBridgeAbstract{ $this->remove_from_cache($item->url); $item->content = $e->getMessage(); } - $this->items[] = $item; - } - - function js_to_unix_timestamp($jsTimestamp){ - return $jsTimestamp/1000; - } - - public function getName(){ - return $this->name; - } - - public function getURI(){ - return $this->uri; - } - - public function getCacheDuration(){ - return 3600; // 1h - } - public function getDescription(){ - return "Gawker press blog content."; + return $item; } } diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php index 1dd9e3e4..e6dee731 100644 --- a/bridges/Les400Culs.php +++ b/bridges/Les400Culs.php @@ -5,59 +5,40 @@ * @description La planète sexe vue par Agnès Girard via rss-bridge * @update 20/02/2014 */ +require_once 'bridges/RssExpander.php'; define("SEXE", "http://sexes.blogs.liberation.fr"); -class Les400Culs extends HttpCachingBridgeAbstract{ +define("RSS", "http://sexes.blogs.liberation.fr/feeds/"); +/** + * As it seems that Les 400 culs currently offer a full feed, we won't change it content here. + * But I'm ready for the day where it will ... again ... provide some truncated content + */ +class Les400Culs extends RssExpander{ public function collectData(array $param){ - $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); - - foreach($html->find('#alpha-inner') as $articles) { - foreach($articles->find('div.entry') as $article) { - $header = $article->find('h3.entry-header a', 0); - $content = $article->find('div.entry-content', 0); - - - $item = new Item(); - $item->title = trim($header->innertext); - $item->uri = $header->href; - $item->name = "Agnès Girard"; - // date is stored outside this node ! 
- $dateHeader = $article->prev_sibling(); - // http://stackoverflow.com/a/6239199/15619 (strtotime is typical amercian bullshit) - $item->timestamp = DateTime::createFromFormat('d/m/Y', $dateHeader->innertext)->getTimestamp(); - - - $linkForMore = $content->find('p.entry-more-link a',0); - if($linkForMore==null) { - $item->content = $content->innertext; - } else { - $pageAddress = $linkForMore->href; - $articlePage = str_get_html($this->get_cached($linkForMore->href)); - if($articlePage==null) { - $item->content = $content->innertext."\n

".$linkForMore->outertext."

"; - } else { - // TODO use some caching there ! - $fullContent = $articlePage->find('div.entry-content', 0); - $item->content = $fullContent->innertext; - } - } - $this->items[] = $item; - } - } + $param['url'] = RSS; + parent::collectData($param); } + + protected function parseRSSItem($newsItem) { + $item = new Item(); + $item->title = trim($newsItem->title); +// $this->message("browsing item ".var_export($newsItem, true)); + if(empty($newsItem->guid)) { + $item->uri = $newsItem->link; + } else { + $item->uri = $newsItem->guid; + } + // now load that uri from cache +// $this->message("now loading page ".$item->uri); +// $articlePage = str_get_html($this->get_cached($item->uri)); - public function getName(){ - return 'Les 400 Culs'; +// $content = $articlePage->find('.post-container', 0); + $item->content = $newsItem->description; + $item->name = $newsItem->author; + $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem); + return $item; } - - public function getURI(){ - return SEXE; - } - public function getCacheDuration(){ return 7200; // 2h hours } - public function getDescription(){ - return "La planète sexe, vue et racontée par Agnès Giard. 
Et par rss-bridge"; - } } diff --git a/bridges/RssExpander.php b/bridges/RssExpander.php new file mode 100644 index 00000000..07268ea1 --- /dev/null +++ b/bridges/RssExpander.php @@ -0,0 +1,68 @@ +returnError('There is no $param[\'url\'] for this RSS expander', 404); + } + // $this->message("Loading from ".$param['url']); + // Notice WE DO NOT use cache here on purpose : we want a fresh view of the RSS stream each time + $rssContent = simplexml_load_file($param['url']) or $this->returnError('Could not request '.$param['url'], 404); +// $this->message("loaded RSS from ".$param['url']); + // TODO insert RSS format detection + // we suppose for now, we have some RSS 2.0 + $this->collect_RSS_2_0_data($rssContent); + } + + private function collect_RSS_2_0_data($rssContent) { + $rssContent = $rssContent->channel[0]; +// $this->message("RSS content is ===========\n".var_export($rssContent, true)."==========="); + $this->load_RSS_2_0_feed_data($rssContent); + foreach($rssContent->item as $item) { +// $this->message("parsing item ".var_export($item, true)); + $this->items[] = $this->parseRSSItem($item); + } + } + + protected function RSS_2_0_time_to_timestamp($item) { + return DateTime::createFromFormat('D, d M Y H:i:s e', $item->pubDate)->getTimestamp(); + } + + // TODO set title, link, description, language, and so on + protected function load_RSS_2_0_feed_data($rssContent) { + $this->name = trim($rssContent->title); + $this->uri = trim($rssContent->link); + $this->description = trim($rssContent->description); + } + + /** + * Method should return, from a source RSS item given by lastRSS, one of our Items objects + * @param $item the input rss item + * @return a RSS-Bridge Item, with (hopefully) the whole content) + */ + abstract protected function parseRSSItem($item); + + + public function getName(){ + return $this->name; + } + + public function getURI(){ + return $this->uri; + } + + public function getDescription() { + return $this->description; + } +} \ No newline at 
end of file diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php index 20ba5f94..edc173fe 100644 --- a/bridges/Sexactu.php +++ b/bridges/Sexactu.php @@ -1,6 +1,6 @@