[ 'url' => [ 'name' => 'Enter web archive URL', 'title' => <<<"EOL" Specify the URL from the archive page where all the archive are listed by month. EOL , 'type' => 'text', 'exampleValue' => 'https://mailman.nginx.org/pipermail/nginx-announce/', 'required' => true ], 'limit' => [ 'name' => 'Limit', 'type' => 'number', 'title' => 'Maximum number of items to return', 'defaultValue' => 5, ], ], ]; public function collectData() { $mails = []; $url = $this->getInput('url'); $limit = $this->getInput('limit'); $html = defaultLinkTo(getSimpleHTMLDOMCached($url, 1800), $url); // Fetch archive urls from the frontpage $archives = []; foreach ($html->find('tr') as $key => $tr) { $archiveUrl = $tr->find('a[href$="date.html"]', 0); $downloadUrl = $tr->find('a[href$=".txt"], a[href$=".txt.gz"]', 0); $archives[$key] = [ 'bydate' => $archiveUrl ? $archiveUrl->getAttribute('href') : null, 'download' => $downloadUrl ? $downloadUrl->getAttribute('href') : null ]; } foreach ($archives as $archive) { if (!$archive['bydate']) { continue; } // Fetch urls to mails $parent = pathinfo($archive['bydate'], PATHINFO_DIRNAME) . '/'; $html = defaultLinkTo(getSimpleHTMLDOMCached($archive['bydate'], 1800), $parent); $links = array_map(function ($val) { return $val->getAttribute('href'); }, $html->find('ul', 1)->find('li a[href$=".html"]')); $mailUrls = array_reverse($links); // Parse mbox $data = getContents($archive['download']); if (str_ends_with($archive['download'], '.gz')) { $data = \gzdecode($data, (1024 ** 2) * 25); // 25M if ($data === false) { throw new \Exception('Failed to gzdecode'); } } $mboxParts = preg_split('/^From /', $data); // Drop the first element which is always an empty string array_shift($mboxParts); $mboxMails = array_reverse($mboxParts); foreach ($mboxMails as $index => $content) { // Match Urls with contents from txt files. // Urls cannot be reconstructed from the txt content. $mails[] = [ 'url' => $mailUrls[$index], 'content' => $content ]; } if (count($mails) > $limit) { break; } } $pluck = function ($header, $mail) { // Not necessary to escape the header here $pattern = sprintf('/(?<=%s:).*$/m', $header); if (preg_match($pattern, $mail, $m)) { return trim(\mb_decode_mimeheader($m[0])); } return null; }; foreach (array_slice($mails, 0, $limit) as $mail) { $item = []; $item['uid'] = $pluck('Message-ID', $mail['content']); $item['uri'] = $mail['url']; $item['title'] = $pluck('Subject', $mail['content']); $item['author'] = preg_replace('/\sat\s/', '@', $pluck('From', $mail['content'])); $item['timestamp'] = $pluck('Date', $mail['content']); $item['content'] = nl2br(self::render($mail['content'])); $this->items[] = $item; } } /** * Parse mbox mail. Render some useful html. * * Based on https://gist.github.com/jbroadway/2836900 */ private static function render($text) { $rules = [ '/[\s\S]*?^Message-ID:[\s\S]*?>\n\n/m' => '', // Metadata '/-+\s+next part\s+-+[\s\S]+?(?=^$|\Z)/m' => '', // next part '/(? '$0', // links '/(\w+) <(.*) at (.*)>/' => '$1', // emails '/(\*|\*\*|__)(.*?)\1/' => '\2', // bold // blockquotes '/(.*)\s+((?:^>.*+\n)+)/m' => function ($regs) { return sprintf( '
%s
%s
', $regs[1], preg_replace('/^>/m', '', $regs[2]) ); }, ]; $text = "\n" . $text . "\n"; foreach ($rules as $regex => $replacement) { if (is_callable($replacement)) { $text = preg_replace_callback($regex, $replacement, $text); } else { $text = preg_replace($regex, $replacement, $text); } } return trim($text); } }