Rewrote these bridges to use a new mechanism: the RSS expander gets data from the RSS feed and expands the truncated entries to their full content.

2015-03-18 17:42:55 +01:00 · 2015-03-18 17:42:55 +01:00 · ce28cfad22
parent fd71ceae82
commit ce28cfad22
5 changed files with 147 additions and 157 deletions
--- a/bridges/Freenews.php
+++ b/bridges/Freenews.php
@ -1,69 +1,37 @@
 <?php
 /**
 *
-* @name FreeNews 
-* @description Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). Ne rentrez pas d'id si vous voulez accéder aux actualités générales.
+* @name Freenews
+* @description Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). Ne rentrez pas d'id si vous voulez accéder aux actualités générales.
 * @update 26/03/2014
 * @use1(id="Id de la rubrique (sans le '-')")
 */
-define('FREENEWS', 'http://www.freenews.fr/');
-define('NEWS', FREENEWS.'spip.php?page=news');
-define('RUBRIQUE', FREENEWS.'spip.php?page=rubrique&id_rubrique=-');
-class FreeNews extends HttpCachingBridgeAbstract{
-    private $uri = NEWS;
-    private $name = 'Freenews';
-
+require_once 'bridges/RssExpander.php';
+define("RSS", 'http://feeds.feedburner.com/Freenews-Freebox?format=xml');
+class Freenews extends RssExpander {
    public function collectData(array $param){
-        if (!empty($param['id'])) {
-            $this->uri = RUBRIQUE.$param['id'];
-        }
-        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
-//        $this->message("loaded HTML from ".$this->getURI());
-        // customize name 
-        $this->name = $html->find('title', 0)->innertext;
-        foreach($html->find('.news_line') as $newsLines) {
-            $this->parseLine($newsLines);
-       }
+        $param['url'] = RSS;
+        parent::collectData($param);
    }
    
-    public function parseLine($newsLines) {
-            foreach($newsLines->find('span') as $newsSpan) {
-                foreach($newsSpan->find('a') as $newsLink) {
+    protected function parseRSSItem($newsItem) {
        $item = new Item();
-                    $item->title = trim($newsLink->title);
-                    $item->uri = FREENEWS.$newsLink->href;
+        $item->title = trim($newsItem->title);
+//        $this->message("item has for title \"".$item->title."\"");
+        if(empty($newsItem->guid)) {
+            $item->uri = $newsItem->link;
+        } else {
+            $item->uri = $newsItem->guid;
+        }
        // now load that uri from cache
+//        $this->message("now loading page ".$item->uri);
        $articlePage = str_get_html($this->get_cached($item->uri));
-                    $content = $articlePage->find('.chapo', 0);
-                    foreach($content->find('img') as $image) {
-                        $image->src = FREENEWS.$image->src;
-                    }
-                    $redaction = $articlePage->find('.redac', 0);
-                    $rubrique = $redaction->find('a', 0);
-                    $auteur = $redaction->find('a', 1);
+
+        $content = $articlePage->find('.post-container', 0);
        $item->content = $content->innertext;
-                    $item->name = $auteur->innertext;
+        $item->name = $articlePage->find('a[rel=author]', 0)->innertext;
        // format should parse 2014-03-25T16:21:20Z. But, according to http://stackoverflow.com/a/10478469, it is not that simple
-                    $item->timestamp = DateTime::createFromFormat('Y-m-d\TH:i:s+', $redaction->title)->getTimestamp();
-                    $this->items[] = $item;
-                    // return after first link, as there are hidden treasures in those pages
-                    return;
-                }
-            }
-    }
-
-    public function getName(){
-        return $this->name;
-    }
-
-    public function getURI(){
-        return $this->uri;
-    }
-
-    public function getCacheDuration(){
-        return 3600; // 2h hours
-    }
-    public function getDescription(){
-        return "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). par rss-bridge";
+        $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);
+        return $item;
    }
 }
--- a/bridges/Gawker.php
+++ b/bridges/Gawker.php
@ -2,44 +2,36 @@
 /**
 *
 * @name Gawker media
-* @description A bridge allowing access to any of the numerous Gawker media blogs (Lifehacker, deadspin, Kotaku, Jezebel, and so on
+* @description A bridge allowing access to any of the numerous Gawker media blogs (Lifehacker, deadspin, Kotaku, Jezebel, and so on). Note that you have to give the site's id to find its RSS stream in the Gawker maze
 * @update 27/03/2014
-* @use1(site="site")
+* @use1(site="site id to put in uri between feeds.gawker.com and /full .. which is obviously not full AT ALL")
 */
-class Gawker extends HttpCachingBridgeAbstract{
-	private $uri;
-	private $name;
+require_once 'bridges/RssExpander.php';
+define("RSS_PREFIX", "http://feeds.gawker.com/");
+define("RSS_SUFFIX", "/full");
+class Gawker extends RssExpander{
+    
+    private function toURI($name) {
+        return RSS_PREFIX.$name.RSS_SUFFIX;
+    }

    public function collectData(array $param){
        if (empty($param['site'])) {
 			trigger_error("If no site is provided, nothing is gonna happen", E_USER_ERROR);
        } else {
-			$this->uri = $param['site'];
-        }
-        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
-        $this->message("loaded HTML from ".$this->getURI());
-        // customize name 
-        $this->name = $html->find('title', 0)->innertext;
-        foreach($html->find('.main-column') as $content) {
-            $this->parseContent($content);
+            $this->name = $param['site'];
+			$param['url'] = $this->toURI(strtolower($param['site']));
        }
+//        $this->message("loading feed from ".$this->getURI());
+        parent::collectData($param);
    }
    
-	public function parseContent($content) {
-		foreach($content->find('.headline') as $headline) {
-			foreach($headline->find('a') as $articleLink) {
-                // notice we only use article from this gawker site (as gawker like to see us visit other sites)
-                if(strpos($articleLink->href, $this->getURI())>=0) {
-    				$this->parseLink($articleLink);
-                }
-			}
-		}
-	}
-    
-    public function parseLink($infoLink) {
+    protected function parseRSSItem($newsItem) {
        $item = new Item();
-        $item->uri = $infoLink->href;
-        $item->title = $infoLink->innertext;
+        $item->uri = trim($newsItem->link);
+        $item->title = trim($newsItem->title);
+        $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);
+//        $this->message("///////////////////////////////////////////////////////////////////////////////////////\nprocessing item ".var_export($item, true)."\n\n\nbuilt from\n\n\n".var_export($newsItem, true));
        try {
            // now load that uri from cache
 //            $this->message("loading page ".$item->uri);
@ -47,15 +39,15 @@ class Gawker extends HttpCachingBridgeAbstract{
            if(is_object($articlePage)) {
                $content = $articlePage->find('.post-content', 0);
                $this->defaultImageSrcTo($content, $this->getURI());
-                $item->content = $content->innertext;
-                // http://stackoverflow.com/q/22715928/15619
-                $publishtime = $articlePage->find('.publish-time', 0)->getAttribute("data-publishtime");
-                // don't know what I'm doing there, but http://www.epochconverter.com/programming/functions-php.php#epoch2date recommends it
-                $item->timestamp = $this->js_to_unix_timestamp($publishtime);
                $vcard = $articlePage->find('.vcard', 0);
                if(is_object($vcard)) {
-    				$item->name = $vcard->find('a', 0)->innertext;
+                    $authorLink = $vcard->find('a', 0);
+    				$item->name = $authorLink->innertext;
+                    // TODO use author link href to fill the feed info
                }
+//                $this->message("item quite loaded : ".var_export($item, true));
+                // I set item content as last element, for easier var_export reading
+                $item->content = $content->innertext;
            } else {
                throw new Exception("cache content for ".$item->uri." is NOT a Simple DOM parser object !");
            }
@ -65,25 +57,6 @@ class Gawker extends HttpCachingBridgeAbstract{
            $this->remove_from_cache($item->url);
            $item->content = $e->getMessage();
        }
-        $this->items[] = $item;
-    }
-
-	function js_to_unix_timestamp($jsTimestamp){
-	  return $jsTimestamp/1000; 
-	}	
-
-    public function getName(){
-        return $this->name;
-    }
-
-    public function getURI(){
-        return $this->uri;
-    }
-
-    public function getCacheDuration(){
-        return 3600; // 1h
-    }
-    public function getDescription(){
-        return "Gawker press blog content.";
+        return $item;
    }
 }
--- a/bridges/Les400Culs.php
+++ b/bridges/Les400Culs.php
@ -5,59 +5,40 @@
 * @description La planète sexe vue par Agnès Girard via rss-bridge
 * @update 20/02/2014
 */
+require_once 'bridges/RssExpander.php';
 define("SEXE", "http://sexes.blogs.liberation.fr");
-class Les400Culs extends HttpCachingBridgeAbstract{
+define("RSS", "http://sexes.blogs.liberation.fr/feeds/");
+/**
+ * As Les 400 Culs currently seems to offer a full feed, we won't change its content here.
+ * But I'm ready for the day when it will ... again ... provide some truncated content
+ */
+class Les400Culs extends RssExpander{

    public function collectData(array $param){
-        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
-
-        foreach($html->find('#alpha-inner') as $articles) {
-            foreach($articles->find('div.entry') as $article) {
-                $header = $article->find('h3.entry-header a', 0);
-                $content = $article->find('div.entry-content', 0);
-
+        $param['url'] = RSS;
+        parent::collectData($param);
+    }
    
+    protected function parseRSSItem($newsItem) {
        $item = new Item();
-                $item->title = trim($header->innertext);
-                $item->uri = $header->href;
-                $item->name = "Agnès Girard";
-                // date is stored outside this node !
-                $dateHeader = $article->prev_sibling();
-                // http://stackoverflow.com/a/6239199/15619 (strtotime is typical amercian bullshit)
-                $item->timestamp = DateTime::createFromFormat('d/m/Y', $dateHeader->innertext)->getTimestamp();
-
-
-                $linkForMore = $content->find('p.entry-more-link a',0);
-                if($linkForMore==null) {
-                    $item->content = $content->innertext;
+        $item->title = trim($newsItem->title);
+//        $this->message("browsing item ".var_export($newsItem, true));
+        if(empty($newsItem->guid)) {
+            $item->uri = $newsItem->link;
        } else {
-                    $pageAddress = $linkForMore->href;
-                    $articlePage = str_get_html($this->get_cached($linkForMore->href));
-                    if($articlePage==null) {
-                        $item->content = $content->innertext."\n<p>".$linkForMore->outertext."</p>";
-                    } else {
-                        // TODO use some caching there !
-                        $fullContent = $articlePage->find('div.entry-content', 0);
-                        $item->content = $fullContent->innertext;
-                    }
-                }
-                $this->items[] = $item;
-            }
-       }
+            $item->uri = $newsItem->guid;
        }
+        // now load that uri from cache
+//        $this->message("now loading page ".$item->uri);
+//        $articlePage = str_get_html($this->get_cached($item->uri));

-    public function getName(){
-        return 'Les 400 Culs';
+//        $content = $articlePage->find('.post-container', 0);
+        $item->content = $newsItem->description;
+        $item->name = $newsItem->author;
+        $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);
+        return $item;
    }
-
-    public function getURI(){
-        return SEXE;
-    }
-
    public function getCacheDuration(){
        return 7200; // 2h hours
    }
-    public function getDescription(){
-        return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge";
-    }
 }
--- a/bridges/RssExpander.php
+++ b/bridges/RssExpander.php
@ -0,0 +1,68 @@
+<?php
+
+/**
+ * A class providing facilities for RSS expansion. The goal here is to facilitate, as much as possible, writing bridges such as FreeNews, Gawker and other ones 
+ * @name RssExpander 
+ * @description Un bridge générique d'expansion automatique de contenu RSS ... pour tous ces sites qui ont un flux RSS mochement tronqué.
+ * @update 15/03/2015
+ * @use1(url="URL du flux dont vous souhaitez le contenu complet")
+ */
+ 
+abstract class RssExpander extends HttpCachingBridgeAbstract{
+    protected $name;
+    private $uri;
+    private $description;
+    public function collectData(array $param){
+        if (empty($param['url'])) {
+            $this->returnError('There is no $param[\'url\'] for this RSS expander', 404);
+        }
+ //       $this->message("Loading from ".$param['url']);
+        // Notice WE DO NOT use cache here on purpose : we want a fresh view of the RSS stream each time
+        $rssContent = simplexml_load_file($param['url']) or $this->returnError('Could not request '.$param['url'], 404);
+//        $this->message("loaded RSS from ".$param['url']);
+        // TODO insert RSS format detection
+        // we suppose for now, we have some RSS 2.0
+        $this->collect_RSS_2_0_data($rssContent);
+    }
+    
+    private function collect_RSS_2_0_data($rssContent) {
+        $rssContent = $rssContent->channel[0];
+//        $this->message("RSS content is ===========\n".var_export($rssContent, true)."===========");
+        $this->load_RSS_2_0_feed_data($rssContent);
+        foreach($rssContent->item as $item) {
+//            $this->message("parsing item ".var_export($item, true));
+            $this->items[] = $this->parseRSSItem($item);
+        }
+    }
+    
+    protected function RSS_2_0_time_to_timestamp($item)  {
+        return DateTime::createFromFormat('D, d M Y H:i:s e', $item->pubDate)->getTimestamp();
+    }
+    
+    // TODO set title, link, description, language, and so on
+    protected function load_RSS_2_0_feed_data($rssContent) {
+        $this->name = trim($rssContent->title);
+        $this->uri = trim($rssContent->link);
+        $this->description = trim($rssContent->description);
+    }
+    
+    /**
+     * Method should return, from a source RSS item (an element of the feed loaded with simplexml_load_file), one of our Item objects
+     * @param $item the input RSS item
+     * @return an RSS-Bridge Item, with (hopefully) the whole content
+     */
+    abstract protected function parseRSSItem($item);
+
+    
+    public function getName(){
+        return $this->name;
+    }
+
+    public function getURI(){
+        return $this->uri;
+    }
+    
+    public function getDescription() {
+        return $this->description;
+    }
+}
--- a/bridges/Sexactu.php
+++ b/bridges/Sexactu.php
@ -1,6 +1,6 @@
 <?php
 /**
-*
+* Unfortunately, Sexactu does not provide an RSS stream, which prevents me from upgrading this to the magnificent RssExpander framework
 * @name Sexactu
 * @description Sexactu via rss-bridge
 * @update 04/02/2014