From bc773a49f86c233bd7aa146135d601518245bb77 Mon Sep 17 00:00:00 2001
From: Binnette <binnette@gmail.com>
Date: Sun, 8 May 2022 03:38:33 +0200
Subject: [PATCH] Full rewrite of bridge DeveloppezDotCom (#2689)

---
 bridges/DeveloppezDotComBridge.php | 420 ++++++++++++++++++++++++++---
 1 file changed, 390 insertions(+), 30 deletions(-)

diff --git a/bridges/DeveloppezDotComBridge.php b/bridges/DeveloppezDotComBridge.php
index 5719cf3f..1d3244b0 100644
--- a/bridges/DeveloppezDotComBridge.php
+++ b/bridges/DeveloppezDotComBridge.php
@@ -1,47 +1,407 @@
 <?php
-class DeveloppezDotComBridge extends FeedExpander {
 
-	const MAINTAINER = 'polopollo';
+class DeveloppezDotComBridge extends FeedExpander
+{
+
+	const MAINTAINER = 'Binnette';
 	const NAME = 'Developpez.com Actus (FR)';
 	const URI = 'https://www.developpez.com/';
+	const DOMAIN = '.developpez.com/';
+	const RSS_URL = 'index/rss';
 	const CACHE_TIMEOUT = 1800; // 30min
-	const DESCRIPTION = 'Returns the 15 newest posts from DeveloppezDotCom (full text).';
+	const DESCRIPTION = 'Returns complete posts from developpez.com';
+	// Encodings used by Developpez.com in their articles body
+	const ENCONDINGS = array('Windows-1252', 'UTF-8');
+	const PARAMETERS = array(
+		array(
+			'limit' => array(
+				'name' => 'Max items',
+				'type' => 'number',
+				'defaultValue' => 5,
+			),
+			// list of the differents RSS availables
+			'domain' => array(
+				'type' => 'list',
+				'name' => 'Domaine',
+				'title' => 'Chosissez un sous-domaine',
+				'values' => array(
+					'= Domaine principal =' => 'www',
+					'4d' => '4d',
+					'abbyy' => 'abbyy',
+					'access' => 'access',
+					'agile' => 'agile',
+					'ajax' => 'ajax',
+					'algo' => 'algo',
+					'alm' => 'alm',
+					'android' => 'android',
+					'apache' => 'apache',
+					'applications' => 'applications',
+					'arduino' => 'arduino',
+					'asm' => 'asm',
+					'asp' => 'asp',
+					'aspose' => 'aspose',
+					'bacasable' => 'bacasable',
+					'big-data' => 'big-data',
+					'bpm' => 'bpm',
+					'bsd' => 'bsd',
+					'business-intelligence' => 'business-intelligence',
+					'c' => 'c',
+					'cloud-computing' => 'cloud-computing',
+					'club' => 'club',
+					'cms' => 'cms',
+					'cpp' => 'cpp',
+					'crm' => 'crm',
+					'css' => 'css',
+					'd' => 'd',
+					'dart' => 'dart',
+					'data-science' => 'data-science',
+					'db2' => 'db2',
+					'delphi' => 'delphi',
+					'dotnet' => 'dotnet',
+					'droit' => 'droit',
+					'eclipse' => 'eclipse',
+					'edi' => 'edi',
+					'embarque' => 'embarque',
+					'emploi' => 'emploi',
+					'etudes' => 'etudes',
+					'excel' => 'excel',
+					'firebird' => 'firebird',
+					'flash' => 'flash',
+					'go' => 'go',
+					'green-it' => 'green-it',
+					'gtk' => 'gtk',
+					'hardware' => 'hardware',
+					'hpc' => 'hpc',
+					'humour' => 'humour',
+					'ibmcloud' => 'ibmcloud',
+					'intelligence-artificielle' => 'intelligence-artificielle',
+					'interbase' => 'interbase',
+					'ios' => 'ios',
+					'java' => 'java',
+					'javascript' => 'javascript',
+					'javaweb' => 'javaweb',
+					'jetbrains' => 'jetbrains',
+					'jeux' => 'jeux',
+					'kotlin' => 'kotlin',
+					'labview' => 'labview',
+					'laravel' => 'laravel',
+					'latex' => 'latex',
+					'lazarus' => 'lazarus',
+					'linux' => 'linux',
+					'mac' => 'mac',
+					'matlab' => 'matlab',
+					'megaoffice' => 'megaoffice',
+					'merise' => 'merise',
+					'microsoft' => 'microsoft',
+					'mobiles' => 'mobiles',
+					'mongodb' => 'mongodb',
+					'mysql' => 'mysql',
+					'netbeans' => 'netbeans',
+					'nodejs' => 'nodejs',
+					'nosql' => 'nosql',
+					'objective-c' => 'objective-c',
+					'office' => 'office',
+					'open-source' => 'open-source',
+					'openoffice-libreoffice' => 'openoffice-libreoffice',
+					'oracle' => 'oracle',
+					'outlook' => 'outlook',
+					'pascal' => 'pascal',
+					'perl' => 'perl',
+					'php' => 'php',
+					'portail-emploi' => 'portail-emploi',
+					'portail-projets' => 'portail-projets',
+					'postgresql' => 'postgresql',
+					'powerpoint' => 'powerpoint',
+					'preprod-emploi' => 'preprod-emploi',
+					'programmation' => 'programmation',
+					'project' => 'project',
+					'purebasic' => 'purebasic',
+					'pyqt' => 'pyqt',
+					'python' => 'python',
+					'qt-creator' => 'qt-creator',
+					'qt' => 'qt',
+					'r' => 'r',
+					'raspberry-pi' => 'raspberry-pi',
+					'reseau' => 'reseau',
+					'ruby' => 'ruby',
+					'rust' => 'rust',
+					'sap' => 'sap',
+					'sas' => 'sas',
+					'scilab' => 'scilab',
+					'securite' => 'securite',
+					'sgbd' => 'sgbd',
+					'sharepoint' => 'sharepoint',
+					'solutions-entreprise' => 'solutions-entreprise',
+					'spring' => 'spring',
+					'sqlserver' => 'sqlserver',
+					'stages' => 'stages',
+					'supervision' => 'supervision',
+					'swift' => 'swift',
+					'sybase' => 'sybase',
+					'symfony' => 'symfony',
+					'systeme' => 'systeme',
+					'talend' => 'talend',
+					'typescript' => 'typescript',
+					'uml' => 'uml',
+					'unix' => 'unix',
+					'vb' => 'vb',
+					'vba' => 'vba',
+					'virtualisation' => 'virtualisation',
+					'visualstudio' => 'visualstudio',
+					'web-semantique' => 'web-semantique',
+					'web' => 'web',
+					'webmarketing' => 'webmarketing',
+					'wind' => 'wind',
+					'windows-azure' => 'windows-azure',
+					'windows' => 'windows',
+					'windowsphone' => 'windowsphone',
+					'word' => 'word',
+					'xhtml' => 'xhtml',
+					'xml' => 'xml',
+					'zend-framework' => 'zend-framework'
+				),
+			)
+		)
+	);
 
-	public function collectData(){
-		$this->collectExpandableDatas(self::URI . 'index/rss', 15);
+	/**
+	 * Return the RSS url for selected domain
+	 */
+	private function getRssUrl()
+	{
+		$domain = $this->getInput('domain');
+		if (!empty($domain)) {
+			return 'https://' . $domain . self::DOMAIN . self::RSS_URL;
+		}
+
+		return self::URI . self::RSS_URL;
 	}
 
-	protected function parseItem($newsItem){
+	/**
+	 * Grabs the RSS item from Developpez.com
+	 */
+	public function collectData()
+	{
+		$url = $this->getRssUrl();
+		$this->collectExpandableDatas($url, 20);
+	}
+
+	/**
+	 * Parse the content of every RSS item. And will try to get the full article
+	 * pointed by the item URL intead of the default abstract.
+	 */
+	protected function parseItem($newsItem)
+	{
+		if (count($this->items) >= $this->getInput('limit')) {
+			return null;
+		}
+
+		// This function parse each entry in the RSS with the default parse
 		$item = parent::parseItem($newsItem);
-		$item['content'] = $this->extractContent($item['uri']);
+
+		// There is a bug in Developpez RSS, coma are writtent as '~?' in the
+		// title, so I have to fix it manually
+		$item['title'] = $this->fixComaInTitle($item['title']);
+
+		// We get the content of the full article behind the RSS item URL
+		$articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
+
+		// Here we call our custom parser
+		$fullText = $this->extractFullText($articleHTMLContent);
+		if (!is_null($fullText)) {
+			// if we manage to parse the page behind the url of the RSS item
+			// then we set it as the new content. Otherwise we keep the default
+			// content to avoid RSS Bridge to return an empty item
+			$item['content'] = $fullText;
+		}
+
+		// Now we will attach video url in item
+		$videosUrl = $this->getAllVideoUrl($articleHTMLContent);
+		if (!empty($videosUrl)) {
+			$item['enclosures'] = array_merge($item['enclosures'], $videosUrl);
+		}
+
+		// Now we can look for the blog writer/creator
+		$author = $articleHTMLContent->find('[itemprop="creator"]', 0);
+		if (!empty($author)) {
+			$item['author'] = $author->outertext;
+		}
+
 		return $item;
 	}
 
-	// F***ing quotes from Microsoft Word badly encoded, here was the trick:
-	// http://stackoverflow.com/questions/1262038/how-to-replace-microsoft-encoded-quotes-in-php
-	private function convertSmartQuotes($string)
+	/**
+	 * Replace '~?' by a proper coma ','
+	 */
+	private function fixComaInTitle($txt)
 	{
-		$search = array(chr(145),
-						chr(146),
-						chr(147),
-						chr(148),
-						chr(151));
-
-		$replace = array(
-			"'",
-			"'",
-			'"',
-			'"',
-			'-'
-		);
-
-		return str_replace($search, $replace, $string);
+		return str_replace('~?', ',', $txt);
 	}
 
-	private function extractContent($url){
-		$articleHTMLContent = getSimpleHTMLDOMCached($url);
-		$text = $this->convertSmartQuotes($articleHTMLContent->find('div.content', 0)->innertext);
-		$text = utf8_encode($text);
-		return trim($text);
+	/**
+	 * Return the full article pointed by the url in the RSS item
+	 * Since Developpez.com only provides a short abstract of the article, we
+	 * use the url to retrieve the complete article and return it as the content
+	 */
+	private function extractFullText($articleHTMLContent)
+	{
+		// All blog entry contains a div with the class 'content'. This div
+		// contains the complete blog article. But the RSS can also return
+		// announcement and not a blog article. So the next if, should take
+		// care of the "non blog" entry
+		$divArticleEntry = $articleHTMLContent->find('div.content', 0);
+		if (is_null($divArticleEntry)) {
+			// Didn't find the div with class content. It is probably not a blog
+			// entry. It is probably just an announcement for an ebook, a PDF,
+			// etc. So we can use the default RSS item content.
+			return null;
+		}
+
+		// The following code is a bit hacky, but I really manage to get the
+		// full content of articles without any encoding issues. What is very
+		// weird and ugly in Developpez.com is the fact the some paragraphs of
+		// the article will be encoded as UTF-8 and some other paragraphs will
+		// be encoded as Windows-1252. So we can NOT decode the full article
+		// with only one encoding. We have to check every paragraph and
+		// determine its encoding
+
+		// This contains all the 'paragraphs' of the article. It includes the
+		// pictures, the text and the links at the bottom of the article
+		$paragraphs = $divArticleEntry->nodes;
+		// This will store the complete decoded content
+		$fullText = '';
+
+		// For each paragraph, we will identify the encoding, then decode it
+		// and finally store the decoded content in $text
+		foreach ($paragraphs as $paragraph) {
+			// We have to recreate a new DOM document from the current node
+			// otherwise the find function will look in the complet article and
+			// not only in the current paragraph. This is an ugly behavior of
+			// the library Simple HTML DOM Parser...
+			$html = str_get_html($paragraph->outertext);
+			$fullText .= $this->decodeParagraph($html);
+		}
+
+		// Finally we return the full 'well' enconded content of the article
+		return $fullText;
+	}
+
+	/**
+	 *
+	 */
+	private function decodeParagraph($p)
+	{
+		// First we check if this paragraph is a video
+		$videoUrl = $this->getVideoUrl($p);
+		if (!empty($videoUrl)) {
+			// If this is a video, we just return a link to the video
+			// &#128250; => 🎞️
+			return	'<p>
+						<b>&#128250; <a href="' . $videoUrl . '">Voir la vidéo</a></b>
+					</p>';
+		}
+
+		// We take outertext to get the complete paragraph not only the text
+		// inside it. That way we still graph block <img> and so on.
+		$pTxt = $p->outertext;
+		// This will store the decoded text if we manage to decode it
+		$decodedTxt = '';
+
+		// This is the only way to properly decode each paragraph. I tried
+		// many stuffs but this is the only working way I found.
+		foreach (self::ENCONDINGS as $enc) {
+			// We check the encoding of the current paragraph
+			if (mb_check_encoding($pTxt, $enc)) {
+				// If the encoding is well recognized, we can convert from
+				// this encoding to UTF-8
+				$decodedTxt = iconv($enc, 'UTF-8', $pTxt);
+			}
+		}
+
+		// We should not trim the strings to avoid the <a> to be glued to the
+		// text like: the software<a href="...">started</a>to...
+		if (!empty($decodedTxt)) {
+			// We manage to decode the text, so we take the decoded version
+			return $this->formatParagraph($decodedTxt);
+		} else {
+			// Otherwise we take the non decoded version and hope it will
+			// be displayed not too ugly in the fulltext content
+			return $this->formatParagraph($pTxt);
+		}
+	}
+
+	/**
+	 * Return true in $txt is a HTML tag and not plain text
+	 */
+	private function isHtmlTagNotTxt($txt)
+	{
+		$html = str_get_html($txt);
+		return $html && $html->root && count($html->root->children) > 0;
+	}
+
+	/**
+	 * Will add a space before paragraph when needed
+	 */
+	private function formatParagraph($txt)
+	{
+		// If the paragraph is an html tag, we add a space before
+		if ($this->isHtmlTagNotTxt($txt)) {
+			// the first element is an html tag and not a text, so we can add a
+			// space before it
+			return ' ' . $txt;
+		}
+		// If the text start with word (not punctation), we had a space
+		$pattern = '/^\w/';
+		if (preg_match($pattern, $txt)) {
+			return ' ' . $txt;
+		}
+		return $txt;
+	}
+
+	/**
+	 * Retrieve all video url in the article
+	 */
+	private function getAllVideoUrl($item)
+	{
+		// Array of video url
+		$url = array();
+
+		// Developpez use a div with the class video-container
+		$divsVideo = $item->find('div.video-container');
+		if (empty($divsVideo)) {
+			return $url;
+		}
+
+		// get the url of the video
+		foreach ($divsVideo as $div) {
+			$html = str_get_html($div->outertext);
+			$url[] = $this->getVideoUrl($html);
+		}
+
+		return $url;
+	}
+
+	/**
+	 * Retrieve URL video. We have to check for the src of an iframe
+	 * Work for Youtube. Will have to test for other video platform
+	 */
+	private function getVideoUrl($p)
+	{
+		$divVideo = $p->find('div.video-container', 0);
+		if (empty($divVideo)) {
+			return null;
+		}
+		$iframe = $divVideo->find('iframe', 0);
+		if (empty($iframe)) {
+			return null;
+		}
+		$src = trim($iframe->getAttribute('src'));
+		if (empty($src)) {
+			return null;
+		}
+		if (str_starts_with($src, '//')) {
+			$src = 'https:' . $src;
+		}
+		return $src;
 	}
 }