From bc773a49f86c233bd7aa146135d601518245bb77 Mon Sep 17 00:00:00 2001 From: Binnette Date: Sun, 8 May 2022 03:38:33 +0200 Subject: [PATCH] Full rewrite of bridge DeveloppezDotCom (#2689) --- bridges/DeveloppezDotComBridge.php | 420 ++++++++++++++++++++++++++--- 1 file changed, 390 insertions(+), 30 deletions(-) diff --git a/bridges/DeveloppezDotComBridge.php b/bridges/DeveloppezDotComBridge.php index 5719cf3f..1d3244b0 100644 --- a/bridges/DeveloppezDotComBridge.php +++ b/bridges/DeveloppezDotComBridge.php @@ -1,47 +1,407 @@ array( + 'name' => 'Max items', + 'type' => 'number', + 'defaultValue' => 5, + ), + // list of the differents RSS availables + 'domain' => array( + 'type' => 'list', + 'name' => 'Domaine', + 'title' => 'Chosissez un sous-domaine', + 'values' => array( + '= Domaine principal =' => 'www', + '4d' => '4d', + 'abbyy' => 'abbyy', + 'access' => 'access', + 'agile' => 'agile', + 'ajax' => 'ajax', + 'algo' => 'algo', + 'alm' => 'alm', + 'android' => 'android', + 'apache' => 'apache', + 'applications' => 'applications', + 'arduino' => 'arduino', + 'asm' => 'asm', + 'asp' => 'asp', + 'aspose' => 'aspose', + 'bacasable' => 'bacasable', + 'big-data' => 'big-data', + 'bpm' => 'bpm', + 'bsd' => 'bsd', + 'business-intelligence' => 'business-intelligence', + 'c' => 'c', + 'cloud-computing' => 'cloud-computing', + 'club' => 'club', + 'cms' => 'cms', + 'cpp' => 'cpp', + 'crm' => 'crm', + 'css' => 'css', + 'd' => 'd', + 'dart' => 'dart', + 'data-science' => 'data-science', + 'db2' => 'db2', + 'delphi' => 'delphi', + 'dotnet' => 'dotnet', + 'droit' => 'droit', + 'eclipse' => 'eclipse', + 'edi' => 'edi', + 'embarque' => 'embarque', + 'emploi' => 'emploi', + 'etudes' => 'etudes', + 'excel' => 'excel', + 'firebird' => 'firebird', + 'flash' => 'flash', + 'go' => 'go', + 'green-it' => 'green-it', + 'gtk' => 'gtk', + 'hardware' => 'hardware', + 'hpc' => 'hpc', + 'humour' => 'humour', + 'ibmcloud' => 'ibmcloud', + 'intelligence-artificielle' => 'intelligence-artificielle', + 'interbase' => 'interbase', + 'ios' => 'ios', + 'java' => 'java', + 'javascript' => 'javascript', + 'javaweb' => 'javaweb', + 'jetbrains' => 'jetbrains', + 'jeux' => 'jeux', + 'kotlin' => 'kotlin', + 'labview' => 'labview', + 'laravel' => 'laravel', + 'latex' => 'latex', + 'lazarus' => 'lazarus', + 'linux' => 'linux', + 'mac' => 'mac', + 'matlab' => 'matlab', + 'megaoffice' => 'megaoffice', + 'merise' => 'merise', + 'microsoft' => 'microsoft', + 'mobiles' => 'mobiles', + 'mongodb' => 'mongodb', + 'mysql' => 'mysql', + 'netbeans' => 'netbeans', + 'nodejs' => 'nodejs', + 'nosql' => 'nosql', + 'objective-c' => 'objective-c', + 'office' => 'office', + 'open-source' => 'open-source', + 'openoffice-libreoffice' => 'openoffice-libreoffice', + 'oracle' => 'oracle', + 'outlook' => 'outlook', + 'pascal' => 'pascal', + 'perl' => 'perl', + 'php' => 'php', + 'portail-emploi' => 'portail-emploi', + 'portail-projets' => 'portail-projets', + 'postgresql' => 'postgresql', + 'powerpoint' => 'powerpoint', + 'preprod-emploi' => 'preprod-emploi', + 'programmation' => 'programmation', + 'project' => 'project', + 'purebasic' => 'purebasic', + 'pyqt' => 'pyqt', + 'python' => 'python', + 'qt-creator' => 'qt-creator', + 'qt' => 'qt', + 'r' => 'r', + 'raspberry-pi' => 'raspberry-pi', + 'reseau' => 'reseau', + 'ruby' => 'ruby', + 'rust' => 'rust', + 'sap' => 'sap', + 'sas' => 'sas', + 'scilab' => 'scilab', + 'securite' => 'securite', + 'sgbd' => 'sgbd', + 'sharepoint' => 'sharepoint', + 'solutions-entreprise' => 'solutions-entreprise', + 'spring' => 'spring', + 'sqlserver' => 'sqlserver', + 'stages' => 'stages', + 'supervision' => 'supervision', + 'swift' => 'swift', + 'sybase' => 'sybase', + 'symfony' => 'symfony', + 'systeme' => 'systeme', + 'talend' => 'talend', + 'typescript' => 'typescript', + 'uml' => 'uml', + 'unix' => 'unix', + 'vb' => 'vb', + 'vba' => 'vba', + 'virtualisation' => 'virtualisation', + 'visualstudio' => 'visualstudio', + 'web-semantique' => 'web-semantique', + 'web' => 'web', + 'webmarketing' => 'webmarketing', + 'wind' => 'wind', + 'windows-azure' => 'windows-azure', + 'windows' => 'windows', + 'windowsphone' => 'windowsphone', + 'word' => 'word', + 'xhtml' => 'xhtml', + 'xml' => 'xml', + 'zend-framework' => 'zend-framework' + ), + ) + ) + ); - public function collectData(){ - $this->collectExpandableDatas(self::URI . 'index/rss', 15); + /** + * Return the RSS url for selected domain + */ + private function getRssUrl() + { + $domain = $this->getInput('domain'); + if (!empty($domain)) { + return 'https://' . $domain . self::DOMAIN . self::RSS_URL; + } + + return self::URI . self::RSS_URL; } - protected function parseItem($newsItem){ + /** + * Grabs the RSS item from Developpez.com + */ + public function collectData() + { + $url = $this->getRssUrl(); + $this->collectExpandableDatas($url, 20); + } + + /** + * Parse the content of every RSS item. And will try to get the full article + * pointed by the item URL intead of the default abstract. + */ + protected function parseItem($newsItem) + { + if (count($this->items) >= $this->getInput('limit')) { + return null; + } + + // This function parse each entry in the RSS with the default parse $item = parent::parseItem($newsItem); - $item['content'] = $this->extractContent($item['uri']); + + // There is a bug in Developpez RSS, coma are writtent as '~?' in the + // title, so I have to fix it manually + $item['title'] = $this->fixComaInTitle($item['title']); + + // We get the content of the full article behind the RSS item URL + $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']); + + // Here we call our custom parser + $fullText = $this->extractFullText($articleHTMLContent); + if (!is_null($fullText)) { + // if we manage to parse the page behind the url of the RSS item + // then we set it as the new content. Otherwise we keep the default + // content to avoid RSS Bridge to return an empty item + $item['content'] = $fullText; + } + + // Now we will attach video url in item + $videosUrl = $this->getAllVideoUrl($articleHTMLContent); + if (!empty($videosUrl)) { + $item['enclosures'] = array_merge($item['enclosures'], $videosUrl); + } + + // Now we can look for the blog writer/creator + $author = $articleHTMLContent->find('[itemprop="creator"]', 0); + if (!empty($author)) { + $item['author'] = $author->outertext; + } + return $item; } - // F***ing quotes from Microsoft Word badly encoded, here was the trick: - // http://stackoverflow.com/questions/1262038/how-to-replace-microsoft-encoded-quotes-in-php - private function convertSmartQuotes($string) + /** + * Replace '~?' by a proper coma ',' + */ + private function fixComaInTitle($txt) { - $search = array(chr(145), - chr(146), - chr(147), - chr(148), - chr(151)); - - $replace = array( - "'", - "'", - '"', - '"', - '-' - ); - - return str_replace($search, $replace, $string); + return str_replace('~?', ',', $txt); } - private function extractContent($url){ - $articleHTMLContent = getSimpleHTMLDOMCached($url); - $text = $this->convertSmartQuotes($articleHTMLContent->find('div.content', 0)->innertext); - $text = utf8_encode($text); - return trim($text); + /** + * Return the full article pointed by the url in the RSS item + * Since Developpez.com only provides a short abstract of the article, we + * use the url to retrieve the complete article and return it as the content + */ + private function extractFullText($articleHTMLContent) + { + // All blog entry contains a div with the class 'content'. This div + // contains the complete blog article. But the RSS can also return + // announcement and not a blog article. So the next if, should take + // care of the "non blog" entry + $divArticleEntry = $articleHTMLContent->find('div.content', 0); + if (is_null($divArticleEntry)) { + // Didn't find the div with class content. It is probably not a blog + // entry. It is probably just an announcement for an ebook, a PDF, + // etc. So we can use the default RSS item content. + return null; + } + + // The following code is a bit hacky, but I really manage to get the + // full content of articles without any encoding issues. What is very + // weird and ugly in Developpez.com is the fact the some paragraphs of + // the article will be encoded as UTF-8 and some other paragraphs will + // be encoded as Windows-1252. So we can NOT decode the full article + // with only one encoding. We have to check every paragraph and + // determine its encoding + + // This contains all the 'paragraphs' of the article. It includes the + // pictures, the text and the links at the bottom of the article + $paragraphs = $divArticleEntry->nodes; + // This will store the complete decoded content + $fullText = ''; + + // For each paragraph, we will identify the encoding, then decode it + // and finally store the decoded content in $text + foreach ($paragraphs as $paragraph) { + // We have to recreate a new DOM document from the current node + // otherwise the find function will look in the complet article and + // not only in the current paragraph. This is an ugly behavior of + // the library Simple HTML DOM Parser... + $html = str_get_html($paragraph->outertext); + $fullText .= $this->decodeParagraph($html); + } + + // Finally we return the full 'well' enconded content of the article + return $fullText; + } + + /** + * + */ + private function decodeParagraph($p) + { + // First we check if this paragraph is a video + $videoUrl = $this->getVideoUrl($p); + if (!empty($videoUrl)) { + // If this is a video, we just return a link to the video + // 📺 => 🎞️ + return '

+ 📺 Voir la vidéo +

'; + } + + // We take outertext to get the complete paragraph not only the text + // inside it. That way we still graph block and so on. + $pTxt = $p->outertext; + // This will store the decoded text if we manage to decode it + $decodedTxt = ''; + + // This is the only way to properly decode each paragraph. I tried + // many stuffs but this is the only working way I found. + foreach (self::ENCONDINGS as $enc) { + // We check the encoding of the current paragraph + if (mb_check_encoding($pTxt, $enc)) { + // If the encoding is well recognized, we can convert from + // this encoding to UTF-8 + $decodedTxt = iconv($enc, 'UTF-8', $pTxt); + } + } + + // We should not trim the strings to avoid the to be glued to the + // text like: the softwarestartedto... + if (!empty($decodedTxt)) { + // We manage to decode the text, so we take the decoded version + return $this->formatParagraph($decodedTxt); + } else { + // Otherwise we take the non decoded version and hope it will + // be displayed not too ugly in the fulltext content + return $this->formatParagraph($pTxt); + } + } + + /** + * Return true in $txt is a HTML tag and not plain text + */ + private function isHtmlTagNotTxt($txt) + { + $html = str_get_html($txt); + return $html && $html->root && count($html->root->children) > 0; + } + + /** + * Will add a space before paragraph when needed + */ + private function formatParagraph($txt) + { + // If the paragraph is an html tag, we add a space before + if ($this->isHtmlTagNotTxt($txt)) { + // the first element is an html tag and not a text, so we can add a + // space before it + return ' ' . $txt; + } + // If the text start with word (not punctation), we had a space + $pattern = '/^\w/'; + if (preg_match($pattern, $txt)) { + return ' ' . $txt; + } + return $txt; + } + + /** + * Retrieve all video url in the article + */ + private function getAllVideoUrl($item) + { + // Array of video url + $url = array(); + + // Developpez use a div with the class video-container + $divsVideo = $item->find('div.video-container'); + if (empty($divsVideo)) { + return $url; + } + + // get the url of the video + foreach ($divsVideo as $div) { + $html = str_get_html($div->outertext); + $url[] = $this->getVideoUrl($html); + } + + return $url; + } + + /** + * Retrieve URL video. We have to check for the src of an iframe + * Work for Youtube. Will have to test for other video platform + */ + private function getVideoUrl($p) + { + $divVideo = $p->find('div.video-container', 0); + if (empty($divVideo)) { + return null; + } + $iframe = $divVideo->find('iframe', 0); + if (empty($iframe)) { + return null; + } + $src = trim($iframe->getAttribute('src')); + if (empty($src)) { + return null; + } + if (str_starts_with($src, '//')) { + $src = 'https:' . $src; + } + return $src; } }