[VkBridge] Rewrited bridge code (#667)

* [VkBridge] Convert special HTML entities to characters in pageName

* [VkBridge] Generate feed item title

* [VkBridge] Remove double backslashes in feed item link

* [VkBridge] Unpin post if pinned

* [VkBridge] Mark reposted messages

* [VkBridge] Correct external link parsing

* [VkBridge] Added article parsing

* [VkBridge] Added video parsing

* [VkBridge] Added photo parsing

* [VkBridge] Added album link parsing

* [VkBridge] Added one more external link selector

* [VkBridge] Using array of link selectors to remove

* [VkBridge] Added document parsing

* [VkBridge] Added sign parsing

* [VkBridge] Fixed incorrect sorting with pinned item

* [VkBridge] More methods to parse documents

* [VkBridge] Save fallback if page name element not found

* [VkBridge] Using post signed as feed item author

* [VkBridge] Fixed document link

* [VkBridge] Coding policy fixes
This commit is contained in:
Eugene Molotov 2018-04-16 14:55:31 +05:00 committed by Teromene
parent 208fff801d
commit 55f112e034
1 changed files with 204 additions and 19 deletions

View File

@ -43,40 +43,225 @@ class VkBridge extends BridgeAbstract
or returnServerError('No results for group or user name "' . $this->getInput('u') . '".'); or returnServerError('No results for group or user name "' . $this->getInput('u') . '".');
$text_html = iconv('windows-1251', 'utf-8', $text_html); $text_html = iconv('windows-1251', 'utf-8', $text_html);
// makes album link generating work correctly
$text_html = str_replace('"class="page_album_link">', '" class="page_album_link">', $text_html);
$html = str_get_html($text_html); $html = str_get_html($text_html);
$pageName = $html->find('.page_name', 0)->plaintext; $pageName = $html->find('.page_name', 0);
$this->pageName = $pageName; if (is_object($pageName)) {
$pageName = $pageName->plaintext;
$this->pageName = htmlspecialchars_decode($pageName);
}
$pinned_post_item = null;
$last_post_id = 0;
foreach ($html->find('.post') as $post) { foreach ($html->find('.post') as $post) {
$is_pinned_post = false;
if (strpos($post->getAttribute('class'), 'post_fixed') !== false) {
$is_pinned_post = true;
}
if (is_object($post->find('a.wall_post_more', 0))) { if (is_object($post->find('a.wall_post_more', 0))) {
//delete link "show full" in content //delete link "show full" in content
$post->find('a.wall_post_more', 0)->outertext = ''; $post->find('a.wall_post_more', 0)->outertext = '';
} }
$content_suffix = "";
// looking for external links
$external_link_selectors = array(
'a.page_media_link_title',
'div.page_media_link_title > a',
'div.media_desc > a.lnk',
);
foreach($external_link_selectors as $sel) {
if (is_object($post->find($sel, 0))) {
$a = $post->find($sel, 0);
$innertext = $a->innertext;
$parsed_url = parse_url($a->getAttribute('href'));
if (strpos($parsed_url['path'], '/away.php') !== 0) continue;
parse_str($parsed_url["query"], $parsed_query);
$content_suffix .= "<br>External link: <a href='" . $parsed_query["to"] . "'>$innertext</a>";
}
}
// remove external link from content
$external_link_selectors_to_remove = array(
'div.page_media_thumbed_link',
'div.page_media_link_desc_wrap',
'div.media_desc > a.lnk',
);
foreach($external_link_selectors_to_remove as $sel) {
if (is_object($post->find($sel, 0))) {
$post->find($sel, 0)->outertext = '';
}
}
// looking for article
$article = $post->find("a.article_snippet", 0);
if (is_object($article)) {
$article_title = $article->find("div.article_snippet__title", 0)->innertext;
$article_author = $article->find("div.article_snippet__author", 0)->innertext;
$article_link = self::URI . ltrim($article->getAttribute('href'), '/');
$article_img_element_style = $article->find("div.article_snippet__image", 0)->getAttribute('style');
preg_match('/background-image: url\((.*)\)/', $article_img_element_style, $matches);
if (count($matches) > 0) {
$content_suffix .= "<br><img src='" . $matches[1] . "'>";
}
$content_suffix .= "<br>Article: <a href='$article_link'>$article_title ($article_author)</a>";
$article->outertext = '';
}
// get video on post
$video = $post->find('div.post_video_desc', 0);
if (is_object($video)) {
$video_title = $video->find('div.post_video_title', 0)->plaintext;
$video_link = self::URI . ltrim( $video->find('a.lnk', 0)->getAttribute('href'), '/' );
$content_suffix .= "<br>Video: <a href='$video_link'>$video_title</a>";
$video->outertext = '';
}
// get all photos
foreach($post->find('div.wall_text > a.page_post_thumb_wrap') as $a) {
$result = $this->getPhoto($a);
if ($result == null) continue;
$a->outertext = '';
$content_suffix .= "<br>$result";
}
// get albums
foreach($post->find('.page_album_wrap') as $el) {
$a = $el->find('.page_album_link', 0);
$album_title = $a->find('.page_album_title_text', 0)->getAttribute('title');
$album_link = self::URI . ltrim($a->getAttribute('href'), '/');
$el->outertext = '';
$content_suffix .= "<br>Album: <a href='$album_link'>$album_title</a>";
}
// get photo documents
foreach($post->find('a.page_doc_photo_href') as $a) {
$doc_link = self::URI . ltrim($a->getAttribute('href'), '/');
$doc_gif_label_element = $a->find(".page_gif_label", 0);
$doc_title_element = $a->find(".doc_label", 0);
if (is_object($doc_gif_label_element)) {
$gif_preview_img = backgroundToImg($a->find('.page_doc_photo', 0));
$content_suffix .= "<br>Gif: <a href='$doc_link'>$gif_preview_img</a>";
} else if (is_object($doc_title_element)) {
$doc_title = $doc_title_element->innertext;
$content_suffix .= "<br>Doc: <a href='$doc_link'>$doc_title</a>";
} else {
continue;
}
$a->outertext = '';
}
// get other documents
foreach($post->find('div.page_doc_row') as $div) {
$doc_title_element = $div->find("a.page_doc_title", 0);
if (is_object($doc_title_element)) {
$doc_title = $doc_title_element->innertext;
$doc_link = self::URI . ltrim($doc_title_element->getAttribute('href'), '/');
$content_suffix .= "<br>Doc: <a href='$doc_link'>$doc_title</a>";
} else {
continue;
}
$div->outertext = '';
}
// get sign
$post_author = $pageName;
foreach($post->find('a.wall_signed_by') as $a) {
$post_author = $a->innertext;
$a->outertext = '';
}
if (is_object($post->find('div.copy_quote', 0))) {
$copy_quote = $post->find('div.copy_quote', 0);
if ($copy_post_header = $copy_quote->find('div.copy_post_header', 0)) {
$copy_post_header->outertext = '';
}
$copy_quote_content = $copy_quote->innertext;
$copy_quote->outertext = "<br>Reposted: <br>$copy_quote_content";
}
$item = array(); $item = array();
$item['content'] = strip_tags(backgroundToImg($post->find('div.wall_text', 0)->innertext), '<br><img>'); $item['content'] = strip_tags(backgroundToImg($post->find('div.wall_text', 0)->innertext), '<br><img>');
$item['content'] .= $content_suffix;
if (is_object($post->find('a.page_media_link_title', 0))) {
$link = $post->find('a.page_media_link_title', 0)->getAttribute('href');
//external link in the post
$item['content'] .= "\n\rExternal link: "
. str_replace('/away.php?to=', '', urldecode($link));
}
//get video on post
if (is_object($post->find('span.post_video_title_content', 0))) {
$titleVideo = $post->find('span.post_video_title_content', 0)->plaintext;
$linkToVideo = self::URI . $post->find('a.page_post_thumb_video', 0)->getAttribute('href');
$item['content'] .= "\n\r {$titleVideo}: {$linkToVideo}";
}
// get post link // get post link
$item['uri'] = self::URI . $post->find('a.post_link', 0)->getAttribute('href'); $post_link = $post->find('a.post_link', 0)->getAttribute('href');
preg_match("/wall-?\d+_(\d+)/", $post_link, $preg_match_result);
$item['post_id'] = intval($preg_match_result[1]);
if (substr(self::URI, -1) == '/') {
$post_link = self::URI . ltrim($post_link, "/");
} else {
$post_link = self::URI . $post_link;
}
$item['uri'] = $post_link;
$item['timestamp'] = $this->getTime($post); $item['timestamp'] = $this->getTime($post);
$item['author'] = $pageName; $item['title'] = $this->getTitle($item['content']);
$item['author'] = $post_author;
if ($is_pinned_post) {
// do not append it now
$pinned_post_item = $item;
} else {
$last_post_id = $item['post_id'];
$this->items[] = $item; $this->items[] = $item;
}
} }
if (is_null($pinned_post_item)) {
return;
} else if (count($this->items) == 0) {
$this->items[] = $pinned_post_item;
} else if ($last_post_id < $pinned_post_item['post_id']) {
$this->items[] = $pinned_post_item;
usort($this->items, function ($item1, $item2) {
return $item2['post_id'] - $item1['post_id'];
});
}
}
private function getPhoto($a) {
$onclick = $a->getAttribute('onclick');
preg_match('/return showPhoto\(.+?({.*})/', $onclick, $preg_match_result);
if (count($preg_match_result) == 0) return;
$arg = htmlspecialchars_decode( str_replace('queue:1', '"queue":1', $preg_match_result[1]) );
$data = json_decode($arg, true);
if ($data == null) return;
$thumb = $data['temp']['base'] . $data['temp']['x_'][0] . ".jpg";
$original = '';
foreach(array('y_', 'z_', 'w_') as $key) {
if (!isset($data['temp'][$key])) continue;
$original = $data['temp']['base'] . $data['temp'][$key][0] . ".jpg";
}
if ($original) {
return "<a href='$original'><img src='$thumb'></a>";
} else {
return "<img src='$thumb'>";
}
}
private function getTitle($content)
{
preg_match('/^["\w\ \p{Cyrillic}\(\)\?#«»-]+/mu', htmlspecialchars_decode($content), $result);
if (count($result) == 0) return "untitled";
return $result[0];
} }
private function getTime($post) private function getTime($post)