Merge pull request #121 from Riduidel/master

Création du RssExpander et utilisation dans Gawker et Sexactu
2015-04-24 17:30:39 +02:00 · 2015-04-24 17:30:39 +02:00 · f1d74a4a27
parent 4e95599d8d 0f6fdb0c5b
commit f1d74a4a27
10 changed files with 478 additions and 12 deletions
--- a/bridges/Dilbert.php
+++ b/bridges/Dilbert.php
@ -0,0 +1,43 @@
+<?php
+/**
+*
+* @name Dilbert Daily Strip 
+* @description The Unofficial Dilbert Daily Comic Strip RSS Feed via rss-bridge
+* @update 16/10/2013
+*/
+class Dilbert extends BridgeAbstract{
+
+    /**
+     * Scrape http://dilbert.com/strips/ and add one item per 'div.STR_Image' element found.
+     *
+     * The strip date is read from the last 10 characters of the link path (the segment that
+     * follows the last "/" located at least 10 characters before the end of the href).
+     * Elements without a link are skipped; a link whose date cannot be parsed still yields an
+     * item, titled with its URL and without timestamp.
+     *
+     * @param array $param request parameters (not used by this bridge)
+     */
+    public function collectData(array $param){
+        $html = file_get_html('http://dilbert.com/strips/') or $this->returnError('Could not request Dilbert.', 404);
+
+        foreach($html->find('div.STR_Image') as $element) {
+            $link = $element->find('a', 0);
+            if(!is_object($link)) {
+                // no anchor in this block: nothing to link to, and no date to extract
+                continue;
+            }
+            $href = (string) $link->href;
+
+            $item = new Item();
+            $item->uri = 'http://dilbert.com' . $href;
+            // make root-relative image sources and links absolute
+            $content = str_replace('src="/', 'src="http://dilbert.com/', $element->innertext);
+            $content = str_replace('href="/', 'href="http://dilbert.com/', $content);
+            $item->content = $content;
+
+            // a negative strrpos offset is only valid when the string is at least that long
+            $time = false;
+            if(strlen($href) >= 10) {
+                $time = strtotime(substr($href, (strrpos($href, "/", -10) + 1), 10));
+            }
+            if($time !== false) {
+                $item->title = date("d/m/Y", $time);
+                $item->timestamp = $time;
+            } else {
+                // unparsable date: avoid publishing a bogus 01/01/1970 title
+                $item->title = $item->uri;
+            }
+            $this->items[] = $item;
+        }
+    }
+
+    /** @return string display name of the bridge */
+    public function getName(){
+        return 'Dilbert';
+    }
+
+    /** @return string address of the scraped site */
+    public function getURI(){
+        return 'http://dilbert.com';
+    }
+
+    /** @return string short description of the bridge */
+    public function getDescription(){
+        return 'Dilbert via rss-bridge';
+    }
+
+    /** @return int cache lifetime in seconds */
+    public function getCacheDuration(){
+        return 14400; // 4 hours
+    }
+}
+
--- a/bridges/Freenews.php
+++ b/bridges/Freenews.php
@ -0,0 +1,37 @@
+<?php
+/**
+*
+* @name Freenews
+* @description Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). Ne rentrez pas d'id si vous voulez accéder aux actualités générales.
+* @update 26/03/2014
+* @use1(id="Id de la rubrique (sans le '-')")
+*/
+require_once 'bridges/RssExpander.php';
+// Kept for backward compatibility. Les400Culs.php defines a global constant with the same name:
+// whichever file is loaded first wins, so the class below no longer relies on it.
+defined('RSS') || define("RSS", 'http://feeds.feedburner.com/Freenews-Freebox?format=xml');
+class Freenews extends RssExpander {
+    /** Address of the Freenews RSS feed. */
+    const FEED_URL = 'http://feeds.feedburner.com/Freenews-Freebox?format=xml';
+
+    /**
+     * Load the Freenews feed, whatever 'url' the caller may have provided.
+     *
+     * @param array $param request parameters; 'url' is overwritten with the Freenews feed address
+     */
+    public function collectData(array $param){
+        $param['url'] = self::FEED_URL;
+        parent::collectData($param);
+    }
+
+    /**
+     * Build an Item from one RSS entry, replacing the feed excerpt by the full article when the
+     * article page can be fetched and contains a '.post-container' element.
+     *
+     * @param SimpleXMLElement $newsItem one <item> of the feed
+     * @return Item item with title, uri, content, timestamp and, when found, author name
+     */
+    protected function parseRSSItem($newsItem) {
+        $item = new Item();
+        $item->title = trim($newsItem->title);
+        // store plain strings rather than SimpleXMLElement nodes (which PHP refuses to serialize;
+        // presumably items get serialized by the cache - TODO confirm)
+        if(empty($newsItem->guid)) {
+            $item->uri = trim($newsItem->link);
+        } else {
+            $item->uri = trim($newsItem->guid);
+        }
+        // now load that uri from cache
+        $articlePage = str_get_html($this->get_cached($item->uri));
+
+        $content = is_object($articlePage) ? $articlePage->find('.post-container', 0) : null;
+        if(is_object($content)) {
+            $item->content = $content->innertext;
+        } else {
+            // page unavailable or layout changed: fall back to what the feed itself provides
+            $item->content = (string) $newsItem->description;
+        }
+        $author = is_object($articlePage) ? $articlePage->find('a[rel=author]', 0) : null;
+        if(is_object($author)) {
+            $item->name = $author->innertext;
+        }
+        $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);
+        return $item;
+    }
+}
--- a/bridges/Gawker.php
+++ b/bridges/Gawker.php
@ -0,0 +1,62 @@
+<?php
+/**
+*
+* @name Gawker media
+* @description A bridge allowing access to any of the numerous Gawker media blogs (Lifehacker, deadspin, Kotaku, Jezebel, and so on). Notice you have to give its id to find the RSS stream in gawker maze
+* @update 27/03/2014
+* @use1(site="site id to put in uri between feeds.gawker.com and /full .. which is obviously not full AT ALL")
+*/
+require_once 'bridges/RssExpander.php';
+define("RSS_PREFIX", "http://feeds.gawker.com/");
+define("RSS_SUFFIX", "/full");
+class Gawker extends RssExpander{
+
+    /**
+     * Build the feed address of one Gawker site.
+     *
+     * @param string $name site id, placed between RSS_PREFIX and RSS_SUFFIX
+     * @return string feed URL
+     */
+    private function toURI($name) {
+        return RSS_PREFIX.$name.RSS_SUFFIX;
+    }
+
+    /**
+     * Load the feed of the site named by $param['site'].
+     *
+     * @param array $param request parameters; 'site' is required, 'url' is derived from it
+     */
+    public function collectData(array $param){
+        if (empty($param['site'])) {
+            trigger_error("If no site is provided, nothing is gonna happen", E_USER_ERROR);
+        } else {
+            $this->name = $param['site'];
+            $param['url'] = $this->toURI(strtolower($param['site']));
+        }
+        parent::collectData($param);
+    }
+
+    /**
+     * Build an Item from one RSS entry, using the '.post-content' element of the article page
+     * as content. When the page cannot be used, the item content is the error message instead.
+     *
+     * @param SimpleXMLElement $newsItem one <item> of the feed
+     * @return Item item with uri, title, timestamp, content and, when found, author name
+     */
+    protected function parseRSSItem($newsItem) {
+        $item = new Item();
+        $item->uri = trim($newsItem->link);
+        $item->title = trim($newsItem->title);
+        $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);
+        try {
+            // now load that uri from cache
+            $articlePage = str_get_html($this->get_cached($item->uri));
+            if(!is_object($articlePage)) {
+                throw new Exception("cache content for ".$item->uri." is NOT a Simple DOM parser object !");
+            }
+            $content = $articlePage->find('.post-content', 0);
+            if(!is_object($content)) {
+                // raised as an exception so the catch block below handles it like any other bad page
+                throw new Exception("no .post-content element found in ".$item->uri);
+            }
+            $this->defaultImageSrcTo($content, $this->getURI());
+            $vcard = $articlePage->find('.vcard', 0);
+            if(is_object($vcard)) {
+                $authorLink = $vcard->find('a', 0);
+                if(is_object($authorLink)) {
+                    $item->name = $authorLink->innertext;
+                    // TODO use author link href to fill the feed info
+                }
+            }
+            // I set item content as last element, for easier var_export reading
+            $item->content = $content->innertext;
+        } catch(Exception $e) {
+            $this->message("obtaining ".$item->uri." resulted in exception ".$e->getMessage().". Deleting cached page ...");
+            // maybe file is incorrect. it should be discarded from cache
+            // (the page was cached under $item->uri; Item has no 'url' set here)
+            $this->remove_from_cache($item->uri);
+            $item->content = $e->getMessage();
+        }
+        return $item;
+    }
+}
--- a/bridges/Les400Culs.php
+++ b/bridges/Les400Culs.php
@ -0,0 +1,44 @@
+<?php
+/**
+*
+* @name Les 400 Culs 
+* @description La planète sexe vue par Agnès Girard via rss-bridge
+* @update 20/02/2014
+*/
+require_once 'bridges/RssExpander.php';
+define("SEXE", "http://sexes.blogs.liberation.fr");
+// Kept for backward compatibility. Freenews.php defines a global constant with the same name:
+// whichever file is loaded first wins, so the class below no longer relies on it.
+defined('RSS') || define("RSS", "http://sexes.blogs.liberation.fr/feeds/");
+/**
+ * As it seems that Les 400 culs currently offer a full feed, we won't change its content here.
+ * But I'm ready for the day where it will ... again ... provide some truncated content
+ */
+class Les400Culs extends RssExpander{
+    /** Address of the blog RSS feed. */
+    const FEED_URL = "http://sexes.blogs.liberation.fr/feeds/";
+
+    /**
+     * Load the blog feed, whatever 'url' the caller may have provided.
+     *
+     * @param array $param request parameters; 'url' is overwritten with the blog feed address
+     */
+    public function collectData(array $param){
+        $param['url'] = self::FEED_URL;
+        parent::collectData($param);
+    }
+
+    /**
+     * Build an Item from one RSS entry, copying the feed content as is (no page download).
+     *
+     * @param SimpleXMLElement $newsItem one <item> of the feed
+     * @return Item item with title, uri, content, author name and timestamp
+     */
+    protected function parseRSSItem($newsItem) {
+        $item = new Item();
+        $item->title = trim($newsItem->title);
+        // store plain strings rather than SimpleXMLElement nodes (which PHP refuses to serialize;
+        // presumably items get serialized by the cache - TODO confirm)
+        if(empty($newsItem->guid)) {
+            $item->uri = trim($newsItem->link);
+        } else {
+            $item->uri = trim($newsItem->guid);
+        }
+        // The feed is complete for now. Should it get truncated again, load the page with
+        // str_get_html($this->get_cached($item->uri)) and read its '.post-container' element.
+        $item->content = (string) $newsItem->description;
+        $item->name = (string) $newsItem->author;
+        $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);
+        return $item;
+    }
+
+    /** @return int cache lifetime in seconds */
+    public function getCacheDuration(){
+        return 7200; // 2 hours
+    }
+}
--- a/bridges/LesJoiesDuCode.php
+++ b/bridges/LesJoiesDuCode.php
@ -0,0 +1,55 @@
+<?php
+/**
+*
+* @name Les Joies Du Code
+* @description LesJoiesDuCode via rss-bridge
+* @update 30/01/2014
+*/
+class LesJoiesDuCode extends BridgeAbstract{
+
+    /**
+     * Scrape the home page of lesjoiesducode.fr and add one item per 'div.post' element.
+     *
+     * Posts lacking either the 'h3 a' title link or the 'div.bodytype' body are skipped.
+     * The author is whatever follows "by" in the '.c1 em' element, when there is one.
+     *
+     * @param array $param request parameters (not used by this bridge)
+     */
+    public function collectData(array $param){
+        $html = file_get_html('http://lesjoiesducode.fr/') or $this->returnError('Could not request LesJoiesDuCode.', 404);
+
+        foreach($html->find('div.post') as $element) {
+            $titleLink = $element->find('h3 a', 0);
+            $body = $element->find('div.bodytype', 0);
+            if(!is_object($titleLink) || !is_object($body)) {
+                // not a regular post (or the layout changed): nothing usable in there
+                continue;
+            }
+
+            $item = new Item();
+
+            $signature = $body->find('.c1 em', 0);
+            if(is_object($signature)) {
+                // strpos returns false when "by" is absent, and may legitimately return 0
+                $pos = strpos($signature->innertext, "by");
+                if($pos !== false) {
+                    $item->name = trim(str_replace("*/", "", substr($signature->innertext, ($pos + 2))));
+                }
+            }
+
+            $item->content = trim($body->innertext);
+            $item->uri = $titleLink->href;
+            $item->title = trim($titleLink->innertext);
+
+            $this->items[] = $item;
+        }
+    }
+
+    /** @return string display name of the bridge */
+    public function getName(){
+        return 'Les Joies Du Code';
+    }
+
+    /** @return string address of the scraped site */
+    public function getURI(){
+        return 'http://lesjoiesducode.fr/';
+    }
+
+    /** @return int cache lifetime in seconds */
+    public function getCacheDuration(){
+        return 7200; // 2 hours
+    }
+
+    /** @return string short description of the bridge */
+    public function getDescription(){
+        return "Les Joies Du Code via rss-bridge";
+    }
+}
--- a/bridges/RssExpander.php
+++ b/bridges/RssExpander.php
@ -0,0 +1,68 @@
+<?php
+
+/**
+ * A class providing facilities for RSS expansion. The goal here is to facilitate, as much as possible, writing bridges such as FreeNews, Gawker and other ones 
+ * @name RssExpander 
+ * @description Un bridge générique d'expansion automatique de contenu RSS ... pour tous ces sites qui ont un flux RSS mochement tronqué.
+ * @update 15/03/2015
+ * @use1(url="URL du flux dont vous souhaitez le contenu complet")
+ */
+ 
+abstract class RssExpander extends HttpCachingBridgeAbstract{
+    /** @var string feed title, read from the channel */
+    protected $name;
+    /** @var string feed link, read from the channel */
+    private $uri;
+    /** @var string feed description, read from the channel */
+    private $description;
+
+    /**
+     * Download the feed at $param['url'] and turn each of its entries into an item
+     * through parseRSSItem().
+     *
+     * @param array $param request parameters; 'url' must hold the feed address
+     */
+    public function collectData(array $param){
+        if (empty($param['url'])) {
+            $this->returnError('There is no $param[\'url\'] for this RSS expander', 404);
+        }
+        // Notice WE DO NOT use cache here on purpose : we want a fresh view of the RSS stream each time
+        $rssContent = simplexml_load_file($param['url']) or $this->returnError('Could not request '.$param['url'], 404);
+        // TODO insert RSS format detection
+        // we suppose for now, we have some RSS 2.0
+        $this->collect_RSS_2_0_data($rssContent);
+    }
+
+    /**
+     * Read feed-level data, then every <item>, from the first <channel> of the document.
+     *
+     * @param SimpleXMLElement $rssContent root element of the feed
+     */
+    private function collect_RSS_2_0_data($rssContent) {
+        if(!isset($rssContent->channel[0])) {
+            // e.g. an Atom feed: there is nothing below that this code could read
+            $this->returnError('The feed has no <channel> element: only RSS 2.0 is supported', 500);
+            return;
+        }
+        $channel = $rssContent->channel[0];
+        $this->load_RSS_2_0_feed_data($channel);
+        foreach($channel->item as $item) {
+            $this->items[] = $this->parseRSSItem($item);
+        }
+    }
+
+    /**
+     * Convert the <pubDate> of an RSS entry to a Unix timestamp.
+     *
+     * The 'D, d M Y H:i:s e' format is tried first, then strtotime() as a more lenient fallback.
+     *
+     * @param SimpleXMLElement $item RSS entry holding a pubDate child
+     * @return int Unix timestamp, or 0 when the date is missing or cannot be parsed
+     */
+    protected function RSS_2_0_time_to_timestamp($item)  {
+        $pubDate = trim((string) $item->pubDate);
+        // createFromFormat returns false on mismatch: calling getTimestamp() on it would be fatal
+        $date = DateTime::createFromFormat('D, d M Y H:i:s e', $pubDate);
+        if($date !== false) {
+            return $date->getTimestamp();
+        }
+        $fallback = strtotime($pubDate);
+        return $fallback === false ? 0 : $fallback;
+    }
+
+    /**
+     * Copy title, link and description of the channel into this bridge.
+     * TODO set language, and so on
+     *
+     * @param SimpleXMLElement $rssContent the <channel> element
+     */
+    protected function load_RSS_2_0_feed_data($rssContent) {
+        $this->name = trim($rssContent->title);
+        $this->uri = trim($rssContent->link);
+        $this->description = trim($rssContent->description);
+    }
+
+    /**
+     * Method should return, from a source RSS item, one of our Items objects
+     * @param $item the input rss item
+     * @return a RSS-Bridge Item, with (hopefully) the whole content
+     */
+    abstract protected function parseRSSItem($item);
+
+    /** @return string feed title (set once the feed has been loaded) */
+    public function getName(){
+        return $this->name;
+    }
+
+    /** @return string feed link (set once the feed has been loaded) */
+    public function getURI(){
+        return $this->uri;
+    }
+
+    /** @return string feed description (set once the feed has been loaded) */
+    public function getDescription() {
+        return $this->description;
+    }
+}
--- a/bridges/Sexactu.php
+++ b/bridges/Sexactu.php
@ -1,11 +1,8 @@
 <?php
 /**
-* 2014-05-26
-*
+* Unfortunately, Sexactu does not provide an RSS stream, which prevents me from upgrading this to the magnificent RssExpander framework
 * @name Sexactu
-* @homepage http://www.gqmagazine.fr/sexactu
-* @description Sexactu
-* @maintainer Riduidel
+* @description Sexactu via rss-bridge
 * @update 04/02/2014
 */
 define("GQ", "http://www.gqmagazine.fr");
@ -69,7 +66,7 @@ $replace = array('January', 'February', 'March', 'April', 'May', 'June', 'July',
        return 7200; // 2h hours
    }
    public function getDescription(){
-        return "Sexactu";
+        return "Sexactu via rss-bridge";
    }
    
    public function correctCase($str) {
@ -88,4 +85,3 @@ $replace = array('January', 'February', 'March', 'April', 'May', 'June', 'July',
        return $str;
    }
 }
-
--- a/bridges/WorldOfTanks.php
+++ b/bridges/WorldOfTanks.php
@ -0,0 +1,63 @@
+<?php
+/**
+*
+* @name World of Tanks 
+* @description News about the tank slaughter game. Language can be fr, ?
+* @update 26/03/2014
+* @use1(lang="Searched language",category="Category id")
+*/
+define('WORLD_OF_TANKS', 'http://worldoftanks.eu/');
+define('NEWS', '/news/');
+class WorldOfTanks extends HttpCachingBridgeAbstract{
+    /** @var string language segment of the news URL */
+    private $lang = "fr";
+    /** @var string address of the news index being scraped */
+    private $uri = WORLD_OF_TANKS;
+    /** @var string feed name, replaced by the page title once loaded */
+    private $name = 'World of tanks news';
+
+    /**
+     * Load the news index for the requested language/category and add one item per
+     * '.b-imgblock_ico' link found on it.
+     *
+     * @param array $param request parameters; optional 'lang' (defaults to "fr") and 'category'
+     */
+    public function collectData(array $param){
+        if (!empty($param['lang'])) {
+            $this->lang = $param['lang'];
+        }
+        if(empty($param['category'])) {
+            $this->uri = WORLD_OF_TANKS.$this->lang.NEWS;
+        } else {
+            $this->uri = WORLD_OF_TANKS.$this->lang.NEWS.$param['category']."/";
+        }
+        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
+        $this->message("loaded HTML from ".$this->getURI());
+        // customize name, keeping the default one when the page has no <title>
+        $pageTitle = $html->find('title', 0);
+        if(is_object($pageTitle)) {
+            $this->name = $pageTitle->innertext;
+        }
+        foreach($html->find('.b-imgblock_ico') as $infoLink) {
+            $this->parseLine($infoLink);
+        }
+    }
+
+    /**
+     * Download (through the page cache) the article a news link points to and add it to the items.
+     * Nothing is added when the page cannot be parsed or has no '.l-content' element.
+     *
+     * @param object $infoLink link element of the news index, read through its href
+     */
+    public function parseLine($infoLink) {
+        $item = new Item();
+        // WORLD_OF_TANKS already ends with "/": drop the leading one of the href, if any
+        $item->uri = WORLD_OF_TANKS.ltrim($infoLink->href, '/');
+        // now load that uri from cache
+        $articlePage = str_get_html($this->get_cached($item->uri));
+        $content = is_object($articlePage) ? $articlePage->find('.l-content', 0) : null;
+        if(!is_object($content)) {
+            return;
+        }
+        $this->defaultImageSrcTo($content, WORLD_OF_TANKS);
+
+        $heading = $content->find('h1', 0);
+        if(is_object($heading)) {
+            $item->title = $heading->innertext;
+        }
+        $body = $content->find('.b-content', 0);
+        if(is_object($body)) {
+            $item->content = $body->innertext;
+        }
+        $time = $content->find('.b-statistic_time', 0);
+        if(is_object($time)) {
+            $item->timestamp = $time->getAttribute("data-timestamp");
+        }
+        $this->items[] = $item;
+    }
+
+    /** @return string feed name */
+    public function getName(){
+        return $this->name;
+    }
+
+    /** @return string address of the news index */
+    public function getURI(){
+        return $this->uri;
+    }
+
+    /** @return int cache lifetime in seconds */
+    public function getCacheDuration(){
+        return 3600; // 1 hour
+    }
+
+    /** @return string short description of the bridge */
+    public function getDescription(){
+        return "Toutes les actualités les plus brulantes de ce simulateur de destruction d'acier.";
+    }
+}
--- a/index.php
+++ b/index.php
@ -88,9 +88,11 @@ try{

                    // Data retrieval
                    $bridge = Bridge::create($bridge);
-                    $bridge
-                        ->setCache($cache) // Comment this lign for avoid cache use
-                        ->setDatas($_REQUEST);
+                    // Caching stays enabled unless the request carries a "disable_cache" parameter
+                    if(!isset($_REQUEST["disable_cache"])) {
+                        $bridge->setCache($cache);
+                    }
+                    $bridge->setDatas($_REQUEST);

                    // Data transformation
                    $format = Format::create($format);
--- a/lib/Bridge.php
+++ b/lib/Bridge.php
@ -71,6 +71,102 @@ abstract class BridgeAbstract implements BridgeInterface{

        return $this;
    }
+
+    /**
+     * Make root-relative image sources absolute: every <img> below $content whose src starts
+     * with a single "/" gets $server prepended. Other sources (absolute or protocol-relative
+     * URLs, document-relative paths, missing src) are left untouched.
+     *
+     * @param object $content DOM node exposing find('img'); anything that is not an object is ignored
+     * @param string $server  address of the server, with or without trailing "/"
+     */
+    public function defaultImageSrcTo($content, $server) {
+        if(!is_object($content)) {
+            // callers pass the result of find(), which may have matched nothing
+            return;
+        }
+        $base = rtrim($server, '/');
+        foreach($content->find('img') as $image) {
+            $src = (string) $image->src;
+            // strict comparison: strpos returns false (loosely equal to 0) when there is no "/" at all
+            if(strpos($src, '/') === 0 && strpos($src, '//') !== 0) {
+                $image->src = $base.$src;
+            }
+        }
+    }
+}
+
+/**
+ * Extension of BridgeAbstract allowing caching of files downloaded over http.
+ * This is especially useful for sites from the Gawker or Liberation networks, which show page excerpts together on an index, while full pages have to be downloaded
+ * separately.
+ * This class mainly provides a get_cached method which will download the file from its remote location.
+ * TODO allow file cache invalidation by touching files on access, and removing files/directories which have not been touched since ... a long time
+ * After all, rss-bridge is not respawn, is it ?
+ */
+abstract class HttpCachingBridgeAbstract extends BridgeAbstract {
+    
+    /**
+     * Maintain locally cached versions of pages to download to avoid multiple downloads.
+     *
+     * The file name is derived from the url: the scheme is dropped, "?", "&" and "=" become "/",
+     * and "index.html" is appended when the result ends with "/". Files are stored below
+     * cache/pages/, next to the directory of this file.
+     *
+     * @param string $url url to cache
+     * @return string content of the page, or an empty string when it could not be downloaded
+     */
+    public function get_cached($url) {
+        $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url);
+        // urls come from remote feeds and pages: neutralise ".." segments so that the
+        // resulting file can never be located outside of the cache directory
+        $simplified_url = preg_replace('#(^|/)\.\.(?=/|$)#', '$1__', $simplified_url);
+        // TODO build this from the variable given to Cache
+        $pageCacheDir = __DIR__ . '/../cache/'."pages/";
+        $filename =  $pageCacheDir.$simplified_url;
+        if (substr($filename, -1) == '/') {
+            $filename = $filename."index.html";
+        }
+        // an empty file is the leftover of a failed download, not a usable copy
+        if(is_file($filename) && filesize($filename) > 0) {
+            // TODO try to do neighbour deletion
+            $this->refresh_in_cache($pageCacheDir, $filename);
+        } else {
+            $dir = substr($filename, 0, strrpos($filename, '/'));
+            if(!is_dir($dir)) {
+                mkdir($dir, 0777, true);
+            }
+            $this->download_remote($url, $filename);
+            clearstatcache(true, $filename);
+            if(!is_file($filename) || filesize($filename) === 0) {
+                // do not keep a broken copy: the download will be attempted again next time
+                if(is_file($filename)) {
+                    unlink($filename);
+                }
+                return '';
+            }
+        }
+        return file_get_contents($filename);
+    }
+    
+    /**
+     * Touch a cached file and each of its parent directories up to (excluding) the cache root,
+     * so that their modification time tells when they were last used.
+     *
+     * @param string $pageCacheDir root directory of the page cache
+     * @param string $filename     cached file, expected to be located below $pageCacheDir
+     */
+    private function refresh_in_cache($pageCacheDir, $filename) {
+        // Both paths are normalised first: the root is built with "/../" and a trailing "/",
+        // which no dirname() result would ever be equal to.
+        $cacheRoot = realpath($pageCacheDir);
+        $currentPath = realpath($filename);
+        if($cacheRoot === false || $currentPath === false) {
+            return;
+        }
+        // Each dirname() shortens the path, so the loop ends at the cache root at the latest,
+        // and at once should the file turn out not to be below it.
+        while(strlen($currentPath) > strlen($cacheRoot) && strpos($currentPath, $cacheRoot) === 0) {
+            touch($currentPath);
+            $currentPath = dirname($currentPath);
+        }
+    }
+
+    /**
+     * Copy the content found at $url into the local file $save_path, by chunks of 8192 bytes.
+     *
+     * @param string $url       address to read from
+     * @param string $save_path file to (over)write
+     * @return bool true when the whole content was copied; on failure no partial file is left behind
+     */
+    public function download_remote($url , $save_path) {
+        // open the source first: an unreachable url must not create an empty destination file
+        $handle = fopen($url , "rb");
+        if(!$handle) {
+            return false;
+        }
+        $f = fopen( $save_path , 'w+');
+        if(!$f) {
+            fclose($handle);
+            return false;
+        }
+        $complete = true;
+        while (!feof($handle)) {
+            $contents = fread($handle, 8192);
+            if($contents === false) {
+                // read error: feof() may never become true, so leave the loop explicitly
+                $complete = false;
+                break;
+            }
+            // compare with '' rather than testing truthiness: the chunk "0" is valid content
+            if($contents !== '' && fwrite($f , $contents) === false) {
+                $complete = false;
+                break;
+            }
+        }
+        fclose($handle);
+        fclose($f);
+        if(!$complete) {
+            unlink($save_path);
+        }
+        return $complete;
+    }
+    
+    /**
+     * Log the cache file matching $url as being removed. Nothing is actually deleted for now:
+     * the computed file name is not considered reliable enough to unlink it.
+     *
+     * @param string $url url whose cached copy should be discarded
+     */
+    public function remove_from_cache($url) {
+        $searched = ["http://", "https://", "?", "&", "="];
+        $replacements = ["", "", "/", "/", "/"];
+        // TODO build this from the variable given to Cache
+        $cacheRoot = __DIR__ . '/../cache/'."pages/";
+        $cachedFile = realpath($cacheRoot.str_replace($searched, $replacements, $url));
+        $this->message("removing from cache \"".$cachedFile."\" WELL, NOT REALLY");
+        // filename is NO GOOD, hence no unlink($cachedFile) here
+    }
+    
+    /**
+     * Write $text to the PHP error log, prefixed with a file, a line, the class of this bridge
+     * and a function name, all taken from the third frame of the call stack.
+     *
+     * @param string $text message to log
+     */
+    public function message($text) {
+        $backtrace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3);
+        // the stack may hold fewer than three frames: use the deepest one available then
+        $calling = isset($backtrace[2]) ? $backtrace[2] : end($backtrace);
+        // frames of calls issued by PHP itself carry neither file nor line
+        $file = isset($calling["file"]) ? $calling["file"] : "unknown file";
+        $line = isset($calling["line"]) ? $calling["line"] : "?";
+        $function = isset($calling["function"]) ? $calling["function"] : "?";
+        $message = $file.":".$line
+            ." class ".get_class($this)."->".$function
+            ." - ".$text;
+        error_log($message);
+    }
 }

 class Bridge{
@ -92,9 +188,9 @@ class Bridge{
        }

        $pathBridge = self::getDir() . $nameBridge . '.php';
-
+        
        if( !file_exists($pathBridge) ){
-            throw new \Exception('The bridge you looking for does not exist.');
+            throw new \Exception('The bridge you looking for does not exist. It should be at path '.$pathBridge);
        }

        require_once $pathBridge;