nodeType === XML_TEXT_NODE) { $nextNode = $node->nextSibling; if (!$nextNode) { break; } $node = $nextNode; } }

    /**
     * Step backwards over text nodes until a non-text sibling is reached.
     *
     * $node is updated in place. When the run of text nodes reaches the start
     * of the sibling list, $node is left on that last text node, so callers
     * must not assume the result is an element.
     *
     * @param DOMNode|null $node Start node; replaced by the node that was found.
     * @return void
     */
    private function jumpToPreviousTag(&$node)
    {
        while ($node && $node->nodeType === XML_TEXT_NODE) {
            $previousNode = $node->previousSibling;
            if (!$previousNode) {
                break;
            }
            $node = $previousNode;
        }
    }

    /**
     * Download the page at getURI(), split it into parts and collect the feed
     * items of every part into $this->items.
     *
     * Each part is parsed with DOMDocument. The first <h1> of a part, when
     * present, updates $this->editionTimeStamp from the text following 'for '.
     * The extractor is chosen from the class names found in the raw markup:
     * no 'Cat1HL' -> getFeatureContents(), 'Cat1HL' without 'Cat3HL' ->
     * getBriefItems(), otherwise getAnnouncements().
     *
     * @return void
     */
    public function collectData()
    {
        // Because the LWN page is written in loose HTML and not XHTML,
        // Simple HTML Dom is not accurate enough for the job
        $content = getContents($this->getURI());

        // NOTE(review): this delimiter may have lost surrounding markup, like
        // the literals reconstructed below; it is kept byte-identical here.
        $contents = explode('Page editor', $content);

        foreach ($contents as $content) {
            // NOTE(review): the markup literals in this if/else were empty or
            // garbled (empty strpos() needle, broken heredoc, empty suffix).
            // They are reconstructed on the assumption that a part without its
            // own <html> element must be wrapped into a full document, and that
            // the part holding the original <html> lost its closing tags to the
            // explode() above. Confirm against the upstream file.
            if (stripos($content, '<html') === false) {
                $content = <<<EOD
<html><head><title>LWN</title></head><body>{$content}</body></html>
EOD;
            } else {
                $content = $content . '</body></html>';
            }

            // Parser diagnostics caused by the loose HTML are collected
            // internally and discarded instead of being emitted as warnings.
            libxml_use_internal_errors(true);
            $html = new DOMDocument();
            $html->loadHTML($content);
            libxml_clear_errors();

            $edition = $html->getElementsByTagName('h1');
            if ($edition->length !== 0) {
                $text = $edition->item(0)->textContent;
                $marker = 'for ';
                $markerPos = strpos($text, $marker);
                if ($markerPos !== false) {
                    $timestamp = strtotime(substr($text, $markerPos + strlen($marker)));
                    // Keep the previous timestamp when the date cannot be parsed.
                    if ($timestamp !== false) {
                        $this->editionTimeStamp = $timestamp;
                    }
                }
            }

            if (strpos($content, 'Cat1HL') === false) {
                $items = $this->getFeatureContents($html);
            } elseif (strpos($content, 'Cat3HL') === false) {
                $items = $this->getBriefItems($html);
            } else {
                $items = $this->getAnnouncements($html);
            }

            $this->items = array_merge($this->items, $items);
        }
    }

    /**
     * Return the class attribute of a node, or '' when there is none.
     *
     * Null and non-element nodes (text, comments, ...) yield '' instead of
     * triggering a call to getAttribute() on something that does not have it.
     *
     * @param DOMNode|null $node Node to inspect.
     * @return string Exact value of the class attribute, or ''.
     */
    private function getNodeClass($node)
    {
        if (!$node || $node->nodeType !== XML_ELEMENT_NODE) {
            return '';
        }

        return $node->getAttribute('class');
    }

    /**
     * Concatenate the canonical XML (C14N) of the siblings following $start.
     *
     * Collection stops at the end of the sibling list, at the first sibling
     * whose class attribute exactly equals one of $stopClasses and, when
     * $stopAtH2 is true, at the first <h2> element.
     *
     * @param DOMNode  $start       Node whose following siblings are collected.
     * @param string[] $stopClasses Class attribute values that end the content.
     * @param bool     $stopAtH2    Whether an <h2> element also ends the content.
     * @return string Serialized content; '' when nothing was collected.
     */
    private function collectSiblingContent($start, array $stopClasses, $stopAtH2)
    {
        $content = '';

        for ($node = $start->nextSibling; $node; $node = $node->nextSibling) {
            if ($stopAtH2 && $node->nodeType !== XML_TEXT_NODE && $node->nodeName === 'h2') {
                break;
            }
            if (in_array($this->getNodeClass($node), $stopClasses, true)) {
                break;
            }
            $content .= $node->C14N();
        }

        return $content;
    }

    /**
     * Build the uri, timestamp and content fields of the article under $title.
     *
     * The uri is self::URI, extended by the href of the heading's first
     * non-text child when that child is an <a>. The content is everything
     * after the heading up to the next <h2> or Cat1HL/Cat2HL element.
     *
     * @param DOMNode $title Heading element of the article.
     * @return array Item fields 'uri', 'timestamp' and 'content'.
     */
    private function getArticleContent(&$title)
    {
        $item = [];

        $link = $title->firstChild;
        $this->jumpToNextTag($link);

        $item['uri'] = self::URI;
        // $link is null when the heading has no child nodes at all.
        if ($link && $link->nodeName === 'a') {
            $item['uri'] .= $link->getAttribute('href');
        }

        $item['timestamp'] = $this->editionTimeStamp;
        $item['content'] = $this->collectSiblingContent($title, ['Cat1HL', 'Cat2HL'], true);

        return $item;
    }

    /**
     * Extract the feature articles: every <h3 class="SummaryHL"> directly
     * followed by an element of class 'FeatureByline'.
     *
     * Headings without such a byline are skipped. The author is the text of
     * the first <b> inside the byline; the key is omitted when there is no <b>.
     *
     * @param DOMDocument $html Parsed page part.
     * @return array[] List of items.
     */
    private function getFeatureContents(&$html)
    {
        $items = [];

        foreach ($html->getElementsByTagName('h3') as $title) {
            if ($title->getAttribute('class') !== 'SummaryHL') {
                continue;
            }

            $author = $title->nextSibling;
            $this->jumpToNextTag($author);
            // getNodeClass() also covers a missing or non-element sibling.
            if ($this->getNodeClass($author) !== 'FeatureByline') {
                continue;
            }

            $item = [];
            $authorName = $author->getElementsByTagName('b')->item(0);
            if ($authorName) {
                $item['author'] = $authorName->textContent;
            }
            $item['title'] = $title->textContent;

            $items[] = array_merge($item, $this->getArticleContent($title));
        }

        return $items;
    }

    /**
     * Update the running category state from the headers ending at $cat and
     * return the title prefix for the current item.
     *
     * Starting at $cat, the headers are read backwards in the order
     * Cat3HL -> Cat2HL -> Cat1HL. A level is only read when it is the element
     * immediately preceding the level below it. Setting a level clears the
     * deeper levels that were not seen during this call. When $cat carries
     * none of these classes (or is not an element), $cats is left untouched
     * and the prefix is built from the previous state.
     *
     * @param DOMNode|null $cat  Candidate category header; moved backwards
     *                           in place while headers are consumed.
     * @param string[]     $cats Running [Cat1HL, Cat2HL, Cat3HL] texts,
     *                           updated in place.
     * @return string '[cat1] ' or '[cat1/cat2] ', or '' when no top-level
     *                category is known.
     */
    private function getItemPrefix(&$cat, &$cats)
    {
        $cat2 = '';
        $cat3 = '';

        switch ($this->getNodeClass($cat)) {
            case 'Cat3HL':
                $cat3 = $cat->textContent;
                $cat = $cat->previousSibling;
                $this->jumpToPreviousTag($cat);
                $cats[2] = $cat3;
                if ($this->getNodeClass($cat) !== 'Cat2HL') {
                    break;
                }
                // Intentional fall-through: the preceding element is the
                // Cat2HL header this Cat3HL header belongs to.
            case 'Cat2HL':
                $cat2 = $cat->textContent;
                $cat = $cat->previousSibling;
                $this->jumpToPreviousTag($cat);
                $cats[1] = $cat2;
                if ($cat3 === '') {
                    $cats[2] = '';
                }
                if ($this->getNodeClass($cat) !== 'Cat1HL') {
                    break;
                }
                // Intentional fall-through: the preceding element is the
                // Cat1HL header this Cat2HL header belongs to.
            case 'Cat1HL':
                $cats[0] = $cat->textContent;
                if ($cat3 === '') {
                    $cats[2] = '';
                }
                if ($cat2 === '') {
                    $cats[1] = '';
                }
                break;
            default:
                break;
        }

        $prefix = '';
        if (!empty($cats[0])) {
            $prefix .= '[' . $cats[0] . (!empty($cats[1]) ? '/' . $cats[1] : '') . '] ';
        }

        return $prefix;
    }

    /**
     * Build one item per <h2 class="SummaryHL"> heading.
     *
     * The category header is looked up two elements before the heading
     * (text nodes are skipped on the way).
     *
     * @param DOMDocument $html Parsed page part.
     * @param string[]    $cats Running category state, updated in place.
     * @return array[] List of items.
     */
    private function getSummaryItems($html, array &$cats)
    {
        $items = [];

        foreach ($html->getElementsByTagName('h2') as $title) {
            if ($title->getAttribute('class') !== 'SummaryHL') {
                continue;
            }

            $cat = $title->previousSibling;
            $this->jumpToPreviousTag($cat);
            // The heading may be the first node of its parent.
            if ($cat) {
                $cat = $cat->previousSibling;
                $this->jumpToPreviousTag($cat);
            }
            $prefix = $this->getItemPrefix($cat, $cats);

            $item = [];
            $item['title'] = $prefix . ' ' . $title->textContent;

            $items[] = array_merge($item, $this->getArticleContent($title));
        }

        return $items;
    }

    /**
     * Extract the announcement items of a page part.
     *
     * Every <p class="Cat3HL"> becomes an item authored by 'LWN' whose content
     * runs up to the next Cat1HL/Cat2HL/Cat3HL element. The
     * <h2 class="SummaryHL"> articles of the same part are appended afterwards
     * and continue with the category state left by the paragraphs.
     *
     * @param DOMDocument $html Parsed page part.
     * @return array[] List of items.
     */
    private function getAnnouncements(&$html)
    {
        $items = [];
        $cats = ['', '', ''];

        foreach ($html->getElementsByTagName('p') as $newsletters) {
            if ($newsletters->getAttribute('class') !== 'Cat3HL') {
                continue;
            }

            $item = [];
            // There is no per-item link; the running index keeps the uri unique.
            $item['uri'] = self::URI . '#' . count($items);
            $item['timestamp'] = $this->editionTimeStamp;
            $item['author'] = 'LWN';

            $cat = $newsletters->previousSibling;
            $this->jumpToPreviousTag($cat);
            $prefix = $this->getItemPrefix($cat, $cats);
            $item['title'] = $prefix . ' ' . $newsletters->textContent;

            $item['content'] = $this->collectSiblingContent(
                $newsletters,
                ['Cat1HL', 'Cat2HL', 'Cat3HL'],
                false
            );

            $items[] = $item;
        }

        return array_merge($items, $this->getSummaryItems($html, $cats));
    }

    /**
     * Extract the brief items: one item per <h2 class="SummaryHL"> heading,
     * prefixed with the category headers that precede it.
     *
     * @param DOMDocument $html Parsed page part.
     * @return array[] List of items.
     */
    private function getBriefItems(&$html)
    {
        $cats = ['', '', ''];

        return $this->getSummaryItems($html, $cats);
    }
}