From 55f112e0345e474b9f6508cd8c89c70bd1e9e5c5 Mon Sep 17 00:00:00 2001 From: Eugene Molotov Date: Mon, 16 Apr 2018 14:55:31 +0500 Subject: [PATCH] [VkBridge] Rewrited bridge code (#667) * [VkBridge] Convert special HTML entities to characters in pageName * [VkBridge] Generate feed item title * [VkBridge] Remove double backslashes in feed item link * [VkBridge] Unpin post if pinned * [VkBridge] Mark reposted messages * [VkBridge] Correct external link parsing * [VkBridge] Added article parsing * [VkBridge] Added video parsing * [VkBridge] Added photo parsing * [VkBridge] Added album link parsing * [VkBridge] Added one more external link selector * [VkBridge] Using array of link selectors to remove * [VkBridge] Added document parsing * [VkBridge] Added sign parsing * [VkBridge] Fixed incorrect sorting with pinned item * [VkBridge] More methods to parse documents * [VkBridge] Save fallback if page name element not found * [VkBridge] Using post signed as feed item author * [VkBridge] Fixed document link * [VkBridge] Coding policy fixes --- bridges/VkBridge.php | 223 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 204 insertions(+), 19 deletions(-) diff --git a/bridges/VkBridge.php b/bridges/VkBridge.php index e1529569..8be4bc60 100644 --- a/bridges/VkBridge.php +++ b/bridges/VkBridge.php @@ -43,40 +43,225 @@ class VkBridge extends BridgeAbstract or returnServerError('No results for group or user name "' . $this->getInput('u') . '".'); $text_html = iconv('windows-1251', 'utf-8', $text_html); + // makes album link generating work correctly + $text_html = str_replace('"class="page_album_link">', '" class="page_album_link">', $text_html); $html = str_get_html($text_html); - $pageName = $html->find('.page_name', 0)->plaintext; - $this->pageName = $pageName; + $pageName = $html->find('.page_name', 0); + if (is_object($pageName)) { + $pageName = $pageName->plaintext; + $this->pageName = htmlspecialchars_decode($pageName); + } + $pinned_post_item = null; + $last_post_id = 0; foreach ($html->find('.post') as $post) { + $is_pinned_post = false; + if (strpos($post->getAttribute('class'), 'post_fixed') !== false) { + $is_pinned_post = true; + } + if (is_object($post->find('a.wall_post_more', 0))) { //delete link "show full" in content $post->find('a.wall_post_more', 0)->outertext = ''; } + + $content_suffix = ""; + + // looking for external links + $external_link_selectors = array( + 'a.page_media_link_title', + 'div.page_media_link_title > a', + 'div.media_desc > a.lnk', + ); + + foreach($external_link_selectors as $sel) { + if (is_object($post->find($sel, 0))) { + $a = $post->find($sel, 0); + $innertext = $a->innertext; + $parsed_url = parse_url($a->getAttribute('href')); + if (strpos($parsed_url['path'], '/away.php') !== 0) continue; + parse_str($parsed_url["query"], $parsed_query); + $content_suffix .= "
External link: $innertext"; + } + } + + // remove external link from content + $external_link_selectors_to_remove = array( + 'div.page_media_thumbed_link', + 'div.page_media_link_desc_wrap', + 'div.media_desc > a.lnk', + ); + + foreach($external_link_selectors_to_remove as $sel) { + if (is_object($post->find($sel, 0))) { + $post->find($sel, 0)->outertext = ''; + } + } + + // looking for article + $article = $post->find("a.article_snippet", 0); + if (is_object($article)) { + $article_title = $article->find("div.article_snippet__title", 0)->innertext; + $article_author = $article->find("div.article_snippet__author", 0)->innertext; + $article_link = self::URI . ltrim($article->getAttribute('href'), '/'); + $article_img_element_style = $article->find("div.article_snippet__image", 0)->getAttribute('style'); + preg_match('/background-image: url\((.*)\)/', $article_img_element_style, $matches); + if (count($matches) > 0) { + $content_suffix .= "
"; + } + $content_suffix .= "
Article: $article_title ($article_author)"; + $article->outertext = ''; + } + + // get video on post + $video = $post->find('div.post_video_desc', 0); + if (is_object($video)) { + $video_title = $video->find('div.post_video_title', 0)->plaintext; + $video_link = self::URI . ltrim( $video->find('a.lnk', 0)->getAttribute('href'), '/' ); + $content_suffix .= "
Video: $video_title"; + $video->outertext = ''; + } + + // get all photos + foreach($post->find('div.wall_text > a.page_post_thumb_wrap') as $a) { + $result = $this->getPhoto($a); + if ($result == null) continue; + $a->outertext = ''; + $content_suffix .= "
$result"; + } + + // get albums + foreach($post->find('.page_album_wrap') as $el) { + $a = $el->find('.page_album_link', 0); + $album_title = $a->find('.page_album_title_text', 0)->getAttribute('title'); + $album_link = self::URI . ltrim($a->getAttribute('href'), '/'); + $el->outertext = ''; + $content_suffix .= "
Album: $album_title"; + } + + // get photo documents + foreach($post->find('a.page_doc_photo_href') as $a) { + $doc_link = self::URI . ltrim($a->getAttribute('href'), '/'); + $doc_gif_label_element = $a->find(".page_gif_label", 0); + $doc_title_element = $a->find(".doc_label", 0); + + if (is_object($doc_gif_label_element)) { + $gif_preview_img = backgroundToImg($a->find('.page_doc_photo', 0)); + $content_suffix .= "
Gif: $gif_preview_img"; + + } else if (is_object($doc_title_element)) { + $doc_title = $doc_title_element->innertext; + $content_suffix .= "
Doc: $doc_title"; + + } else { + continue; + + } + + $a->outertext = ''; + } + + // get other documents + foreach($post->find('div.page_doc_row') as $div) { + $doc_title_element = $div->find("a.page_doc_title", 0); + + if (is_object($doc_title_element)) { + $doc_title = $doc_title_element->innertext; + $doc_link = self::URI . ltrim($doc_title_element->getAttribute('href'), '/'); + $content_suffix .= "
Doc: $doc_title"; + + } else { + continue; + + } + + $div->outertext = ''; + } + + // get sign + $post_author = $pageName; + foreach($post->find('a.wall_signed_by') as $a) { + $post_author = $a->innertext; + $a->outertext = ''; + } + + if (is_object($post->find('div.copy_quote', 0))) { + $copy_quote = $post->find('div.copy_quote', 0); + if ($copy_post_header = $copy_quote->find('div.copy_post_header', 0)) { + $copy_post_header->outertext = ''; + } + $copy_quote_content = $copy_quote->innertext; + $copy_quote->outertext = "
Reposted:
$copy_quote_content"; + } + $item = array(); $item['content'] = strip_tags(backgroundToImg($post->find('div.wall_text', 0)->innertext), '
'); - - if (is_object($post->find('a.page_media_link_title', 0))) { - $link = $post->find('a.page_media_link_title', 0)->getAttribute('href'); - //external link in the post - $item['content'] .= "\n\rExternal link: " - . str_replace('/away.php?to=', '', urldecode($link)); - } - - //get video on post - if (is_object($post->find('span.post_video_title_content', 0))) { - $titleVideo = $post->find('span.post_video_title_content', 0)->plaintext; - $linkToVideo = self::URI . $post->find('a.page_post_thumb_video', 0)->getAttribute('href'); - $item['content'] .= "\n\r {$titleVideo}: {$linkToVideo}"; - } + $item['content'] .= $content_suffix; // get post link - $item['uri'] = self::URI . $post->find('a.post_link', 0)->getAttribute('href'); + $post_link = $post->find('a.post_link', 0)->getAttribute('href'); + preg_match("/wall-?\d+_(\d+)/", $post_link, $preg_match_result); + $item['post_id'] = intval($preg_match_result[1]); + if (substr(self::URI, -1) == '/') { + $post_link = self::URI . ltrim($post_link, "/"); + } else { + $post_link = self::URI . $post_link; + } + $item['uri'] = $post_link; $item['timestamp'] = $this->getTime($post); - $item['author'] = $pageName; - $this->items[] = $item; + $item['title'] = $this->getTitle($item['content']); + $item['author'] = $post_author; + if ($is_pinned_post) { + // do not append it now + $pinned_post_item = $item; + } else { + $last_post_id = $item['post_id']; + $this->items[] = $item; + } } + + if (is_null($pinned_post_item)) { + return; + } else if (count($this->items) == 0) { + $this->items[] = $pinned_post_item; + } else if ($last_post_id < $pinned_post_item['post_id']) { + $this->items[] = $pinned_post_item; + usort($this->items, function ($item1, $item2) { + return $item2['post_id'] - $item1['post_id']; + }); + } + } + + private function getPhoto($a) { + $onclick = $a->getAttribute('onclick'); + preg_match('/return showPhoto\(.+?({.*})/', $onclick, $preg_match_result); + if (count($preg_match_result) == 0) return; + + $arg = htmlspecialchars_decode( str_replace('queue:1', '"queue":1', $preg_match_result[1]) ); + $data = json_decode($arg, true); + if ($data == null) return; + + $thumb = $data['temp']['base'] . $data['temp']['x_'][0] . ".jpg"; + $original = ''; + foreach(array('y_', 'z_', 'w_') as $key) { + if (!isset($data['temp'][$key])) continue; + $original = $data['temp']['base'] . $data['temp'][$key][0] . ".jpg"; + } + + if ($original) { + return ""; + } else { + return ""; + } + } + + private function getTitle($content) + { + preg_match('/^["\w\ \p{Cyrillic}\(\)\?#«»-]+/mu', htmlspecialchars_decode($content), $result); + if (count($result) == 0) return "untitled"; + return $result[0]; } private function getTime($post)