[CssSelectorBridge] Retrieve metadata for social media embeds (#3602, #3687) (#3706)

* [CssSelectorBridge] Metadata from social embed (#3602, #3687) Implement the following metadata sources: - Facebook Open Graph - Twitter <meta> tags - Standard <meta> tags - JSON linked data (ld+json) The following metadata is supported: - Canonical URL (may help removing garbage from URLs) - Article title - Truncated summary - Published/Updated timestamp - Enclosure/Thumbnail image - Author Name or Twitter handle SitemapBridge will also automatically benefit from this commit. * [php8backports] Add array_is_list() Needed this function for ld+json implementation in CssSelectorBridge. * [SitemapBridge] Add option to discard thumbnail * [CssSelectorBridge] Fix linting issues
2023-09-24 23:07:43 +02:00 · 2023-09-24 23:07:43 +02:00 · e1b911fc1f
parent 09f3c1532a
commit e1b911fc1f
3 changed files with 290 additions and 15 deletions
--- a/bridges/CssSelectorBridge.php
+++ b/bridges/CssSelectorBridge.php
@ -51,6 +51,11 @@ class CssSelectorBridge extends BridgeAbstract
                    EOT,
                'exampleValue' => ' | BlogName',
            ],
            'discard_thumbnail' => [
                'name' => '[Optional] Discard thumbnail set by site author',
                'title' => 'Some sites set their logo as thumbnail for every article. Use this option to discard it.',
                'type' => 'checkbox',
            ],
            'limit' => self::LIMIT
        ]
    ];
@ -82,6 +87,7 @@ class CssSelectorBridge extends BridgeAbstract
        $content_selector = $this->getInput('content_selector');
        $content_cleanup = $this->getInput('content_cleanup');
        $title_cleanup = $this->getInput('title_cleanup');
        $discard_thumbnail = $this->getInput('discard_thumbnail');
        $limit = $this->getInput('limit') ?? 10;
        $html = defaultLinkTo(getSimpleHTMLDOM($url), $url);
@ -92,13 +98,17 @@ class CssSelectorBridge extends BridgeAbstract
            $this->items = $items;
        } else {
            foreach ($items as $item) {
-                $this->items[] = $this->expandEntryWithSelector(
+                $item = $this->expandEntryWithSelector(
                    $item['uri'],
                    $content_selector,
                    $content_cleanup,
                    $title_cleanup,
                    $item['title']
                );
                if ($discard_thumbnail && isset($item['enclosures'])) {
                    unset($item['enclosures']);
                }
                $this->items[] = $item;
            }
        }
    }
@ -246,27 +256,272 @@ class CssSelectorBridge extends BridgeAbstract
        }
        $entry_html = getSimpleHTMLDOMCached($entry_url);
        $item = $this->entryHtmlRetrieveMetadata($entry_html);
        if (empty($item['uri'])) {
            $item['uri'] = $entry_url;
        }
        if (empty($item['title'])) {
            $article_title = $this->getPageTitle($entry_html, $title_cleanup);
            if (!empty($title_default) && (empty($article_title) || $article_title === $this->feedName)) {
                $article_title = $title_default;
            }
            $item['title'] = $article_title;
        }
        $article_content = $entry_html->find($content_selector);
        if (!empty($article_content)) {
            $article_content = $article_content[0];
-        } else {
+            $article_content = convertLazyLoading($article_content);
-            returnClientError('Could not find content selector at URL: ' . $entry_url);
+            $article_content = defaultLinkTo($article_content, $entry_url);
            $article_content = $this->cleanArticleContent($article_content, $content_cleanup);
            $item['content'] = $article_content;
        } else if (!empty($item['content'])) {
            $item['content'] .= '<br /><p><em>Could not extract full content, selector may need to be updated.</em></p>';
        }
-        $article_content = convertLazyLoading($article_content);
+        return $item;
-        $article_content = defaultLinkTo($article_content, $entry_url);
+    }
        $article_content = $this->cleanArticleContent($article_content, $content_cleanup);
        $article_title = $this->getPageTitle($entry_html, $title_cleanup);
        if (!empty($title_default) && (empty($article_title) || $article_title === $this->feedName)) {
            $article_title = $title_default;
        }
    /**
     * Retrieve metadata from entry HTML: title, author, date published, etc. from metadata intended for social media embeds and SEO
     *
     * Two sources are read, in this order:
     *  1. <meta> / <link rel="canonical"> tags (Open Graph, Twitter, standard meta tags)
     *  2. Embedded JSON linked data (ld+json), only for fields not already found in <meta> tags
     *
     * Malformed or unexpected ld+json structures are skipped silently instead of raising errors,
     * because this markup is written by third parties and frequently deviates from the common shapes.
     *
     * @param obj $entry_html DOM object representing the webpage HTML (must expose find(), as returned by getSimpleHTMLDOMCached())
     * @return array Entry data collected from Metadata. Possible keys: uri, title, content, timestamp (int),
     *               enclosures (list with a single URL), author. Keys are only present when a value was found.
     */
    protected function entryHtmlRetrieveMetadata($entry_html)
    {
        $item = [];

        // == First source of metadata: Meta tags ==
        // Facebook Open Graph (og:KEY) - https://developers.facebook.com/docs/sharing/webmasters
        // Twitter (twitter:KEY) - https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started
        // Standard meta tags - https://www.w3schools.com/tags/tag_meta.asp

        // Each Entry field mapping defines a list of possible <meta> tags names that contains the expected value
        static $meta_mappings = [
            // <meta property="article:KEY" content="VALUE" />
            // <meta property="og:KEY" content="VALUE" />
            // <meta property="KEY" content="VALUE" />
            // <meta name="twitter:KEY" content="VALUE" />
            // <meta name="KEY" content="VALUE">
            // <link rel="canonical" href="URL" />
            'uri' => [
                'og:url',
                'twitter:url',
                'canonical'
            ],
            'title' => [
                'og:title',
                'twitter:title'
            ],
            'content' => [
                'og:description',
                'twitter:description',
                'description'
            ],
            'timestamp' => [
                'article:published_time',
                'releaseDate',
                'releasedate',
                'article:modified_time',
                'lastModified',
                'lastmodified'
            ],
            'enclosures' => [
                'og:image:secure_url',
                'og:image:url',
                'og:image',
                'twitter:image',
                'thumbnailImg',
                'thumbnailimg'
            ],
            'author' => [
                'author',
                'article:author',
                'article:author:username',
                'profile:first_name',
                'profile:last_name',
                'article:author:first_name',
                'article:author:last_name',
                'twitter:creator',
            ],
        ];

        // True when no author is known yet, or when all we have is a Twitter @username (a real name is preferred).
        // strncmp() is used instead of $author[0] so that an unexpected empty string cannot trigger an offset warning.
        $author_is_missing_or_handle = static function (array $item) {
            return !isset($item['author']) || strncmp($item['author'], '@', 1) === 0;
        };

        $author_first_name = null;
        $author_last_name = null;

        // For each Entry property, look for corresponding HTML tags using a list of candidates
        foreach ($meta_mappings as $property => $field_list) {
            foreach ($field_list as $field) {
                // Look for HTML meta tag
                $element = null;
                if ($field === 'canonical') {
                    $element = $entry_html->find('link[rel=canonical]');
                } else {
                    $element = $entry_html->find("meta[property=$field], meta[name=$field]");
                }
                // Found something? Extract the value and populate Entry field
                if (!empty($element)) {
                    $element = $element[0];
                    $field_value = '';
                    if ($field === 'canonical') {
                        $field_value = $element->href;
                    } else {
                        $field_value = $element->content;
                    }
                    if (!empty($field_value)) {
                        if ($field === 'article:author:first_name' || $field === 'profile:first_name') {
                            // Name parts are collected separately and do not stop the search for a full author name
                            $author_first_name = $field_value;
                        } else if ($field === 'article:author:last_name' || $field === 'profile:last_name') {
                            $author_last_name = $field_value;
                        } else {
                            $item[$property] = $field_value;
                            break; // Stop on first match, e.g. og:url has priority over canonical url.
                        }
                    }
                }
            }
        }

        // Populate author from first name and last name if all we have is nothing or Twitter @username
        if ($author_is_missing_or_handle($item) && (is_string($author_first_name) || is_string($author_last_name))) {
            $author = trim(($author_first_name ?? '') . ' ' . ($author_last_name ?? ''));
            // Never store an empty author (e.g. whitespace-only name parts): later checks assume a non-empty string
            if ($author !== '') {
                $item['author'] = $author;
            }
        }

        // == Second source of metadata: Embedded JSON ==
        // JSON linked data - https://www.w3.org/TR/2014/REC-json-ld-20140116/
        // JSON linked data is COMPLEX and MAY BE LESS RELIABLE than <meta> tags. Used for fields not found as <meta> tags.
        // The implementation below will load all ld+json we can understand and attempt to extract relevant information.

        // ld+json object types that hold article metadata
        // Each mapping defines item fields and a list of possible JSON fields for this field
        // Each candidate JSON field is either a string (field name) or a list (path to nested field)
        static $ldjson_article_types = ['webpage', 'article', 'newsarticle', 'blogposting'];
        static $ldjson_article_mappings = [
            'uri' => ['url', 'mainEntityOfPage'],
            'title' => ['headline'],
            'content' => ['description'],
            'timestamp' => ['dateModified', 'datePublished'],
            'enclosures' => ['image'],
            'author' => [['author', 'name'], ['author', '@id'], 'author'],
        ];

        // ld+json object types that hold author metadata
        $ldjson_author_types = ['person', 'organization'];
        $ldjson_author_mappings = []; // ID => Name
        $ldjson_author_id = null;

        // Utility function for checking if JSON array matches one of the desired ld+json object types
        // A JSON object may have a single ld+json @type as a string OR several types at once as a list
        $ldjson_is_of_type = static function ($json, $allowed_types) {
            if (!is_array($json) || !isset($json['@type'])) {
                return false;
            }
            $json_types = $json['@type'];
            if (!is_array($json_types)) {
                $json_types = [ $json_types ];
            }
            foreach ($json_types as $item_type) {
                // Ignore malformed (non-string) type entries instead of failing in strtolower()
                if (is_string($item_type) && in_array(strtolower($item_type), $allowed_types, true)) {
                    return true;
                }
            }
            return false;
        };

        // Process ld+json objects embedded in the HTML DOM
        foreach ($entry_html->find('script[type=application/ld+json]') as $html_ldjson_node) {
            $json_raw = json_decode($html_ldjson_node->innertext, true);
            if (!is_array($json_raw)) {
                continue; // Invalid JSON or scalar value: nothing to extract
            }

            // A JSON-LD document is either a single object or a top-level list of objects
            $json_documents = array_is_list($json_raw) ? $json_raw : [ $json_raw ];

            // Each document may directly be an ld+json object AND/OR hold several ones under the '@graph' key
            $json_items = [];
            foreach ($json_documents as $json_document) {
                if (!is_array($json_document)) {
                    continue;
                }
                $json_items[] = $json_document;
                if (isset($json_document['@graph']) && is_array($json_document['@graph'])) {
                    foreach ($json_document['@graph'] as $json_sub_item) {
                        if (is_array($json_sub_item)) {
                            $json_items[] = $json_sub_item;
                        }
                    }
                }
            }

            // Now that we have a list of distinct JSON items, we can process them individually
            foreach ($json_items as $json) {
                if ($ldjson_is_of_type($json, $ldjson_article_types)) {
                    // JSON item that holds an ld+json Article object (or a variant)
                    // For each item property, look for corresponding JSON fields and populate the item
                    foreach ($ldjson_article_mappings as $property => $field_list) {
                        // Skip fields already found as <meta> tags, except Twitter @username (because we might find a better name)
                        if (isset($item[$property]) && !($property === 'author' && $author_is_missing_or_handle($item))) {
                            continue;
                        }
                        foreach ($field_list as $field) {
                            $json_root = $json;
                            // If necessary, navigate inside the JSON object to access a nested field
                            if (is_array($field)) {
                                // At this point, $field = ['author', 'name'] and $json_root = {"author": {"name": "John Doe"}}
                                $json_navigate_ok = true;
                                while (count($field) > 1) {
                                    $sub_field = array_shift($field);
                                    // $json_root may be a scalar here, e.g. {"author": "John Doe"}: that is a dead end, not an error
                                    if (!is_array($json_root) || !array_key_exists($sub_field, $json_root)) {
                                        $json_navigate_ok = false; // Desired path not found in JSON, stop navigating
                                        break;
                                    }
                                    $json_root = $json_root[$sub_field];
                                    if (is_array($json_root) && array_is_list($json_root) && count($json_root) === 1) {
                                        $json_root = $json_root[0]; // Unwrap list of single item e.g. {"author":[{"name":"John Doe"}]}
                                    }
                                }
                                if (!$json_navigate_ok || !is_array($json_root)) {
                                    continue; // Desired path not found in JSON, skip this field
                                }
                                $field = $field[0];
                                // At this point, $field = "name" and $json_root = {"name": "John Doe"}
                            }
                            // Now we can check for desired field in JSON and populate $item accordingly
                            if (isset($json_root[$field])) {
                                $field_value = $json_root[$field];
                                // Only unwrap actual lists: nested objects such as {"image": {"@type": "ImageObject"}}
                                // have no index 0 and are left as-is (then rejected by the string check below)
                                if (is_array($field_value) && isset($field_value[0])) {
                                    $field_value = $field_value[0]; // Different versions of the same enclosure? Take the first one
                                }
                                if (is_string($field_value) && !empty($field_value)) {
                                    if ($property === 'author' && $field === '@id') {
                                        $ldjson_author_id = $field_value; // Author is referred to by its ID: We'll see later if we can resolve it
                                    } else {
                                        $item[$property] = $field_value;
                                        break; // Stop on first match, e.g. {"author":{"name":"John Doe"}} has priority over {"author":"John Doe"}
                                    }
                                }
                            }
                        }
                    }
                } else if ($ldjson_is_of_type($json, $ldjson_author_types)) {
                    // JSON item that holds an ld+json Author object (or a variant)
                    // Only keep well-formed entries: the ID is used as array key and the name as author string
                    if (isset($json['@id'], $json['name']) && is_string($json['@id']) && is_string($json['name']) && $json['name'] !== '') {
                        $ldjson_author_mappings[$json['@id']] = $json['name'];
                    }
                }
            }
        }

        // Attempt to resolve ld+json author if all we have is nothing or Twitter @username
        if ($author_is_missing_or_handle($item) && !is_null($ldjson_author_id) && isset($ldjson_author_mappings[$ldjson_author_id])) {
            $item['author'] = $ldjson_author_mappings[$ldjson_author_id];
        }

        // Adjust item field types
        if (isset($item['enclosures'])) {
            $item['enclosures'] = [ $item['enclosures'] ];
        }
        if (isset($item['timestamp'])) {
            $timestamp = strtotime($item['timestamp']);
            if ($timestamp === false) {
                unset($item['timestamp']); // Unparseable date: better no timestamp than a bogus "false" value
            } else {
                $item['timestamp'] = $timestamp;
            }
        }

        return $item;
    }
 }
--- a/bridges/SitemapBridge.php
+++ b/bridges/SitemapBridge.php
@ -53,6 +53,11 @@ class SitemapBridge extends CssSelectorBridge
                    EOT,
                'exampleValue' => 'https://example.com/sitemap.xml',
            ],
            'discard_thumbnail' => [
                'name' => '[Optional] Discard thumbnail set by site author',
                'title' => 'Some sites set their logo as thumbnail for every article. Use this option to discard it.',
                'type' => 'checkbox',
            ],
            'limit' => self::LIMIT
        ]
    ];
@ -65,6 +70,7 @@ class SitemapBridge extends CssSelectorBridge
        $content_cleanup = $this->getInput('content_cleanup');
        $title_cleanup = $this->getInput('title_cleanup');
        $site_map = $this->getInput('site_map');
        $discard_thumbnail = $this->getInput('discard_thumbnail');
        $limit = $this->getInput('limit');
        $this->feedName = $this->getPageTitle($url, $title_cleanup);
@ -77,7 +83,11 @@ class SitemapBridge extends CssSelectorBridge
        }
        foreach ($links as $link) {
-            $this->items[] = $this->expandEntryWithSelector($link, $content_selector, $content_cleanup, $title_cleanup);
+            $item = $this->expandEntryWithSelector($link, $content_selector, $content_cleanup, $title_cleanup);
            if ($discard_thumbnail && isset($item['enclosures'])) {
                unset($item['enclosures']);
            }
            $this->items[] = $item;
        }
    }
--- a/lib/php8backports.php
+++ b/lib/php8backports.php
@ -54,3 +54,13 @@ if (!function_exists('str_contains')) {
        return $needle !== '' && mb_strpos($haystack, $needle) !== false;
    }
 }
 if (!function_exists('array_is_list')) {
    /**
     * Backport of array_is_list() for PHP versions that do not provide it.
     *
     * An array is a list when its keys are exactly 0, 1, 2, ... in that order.
     * An empty array is considered a list.
     *
     * @param array $arr Array to inspect
     * @return bool true if $arr is a list, false otherwise
     */
    function array_is_list(array $arr)
    {
        // Walk the keys in storage order and compare each one with the index it should have
        $expected_key = 0;
        foreach ($arr as $actual_key => $_unused) {
            if ($actual_key !== $expected_key) {
                return false;
            }
            $expected_key++;
        }
        return true;
    }
 }