array( 'name' => 'Limit', 'type' => 'number', 'required' => false, 'defaultValue' => 20, 'title' => 'Maximum number of posts to return (max 20)' ) ) );

    /**
     * Build the feed from the blog listing page.
     *
     * Reads the listing at self::URI, keeps at most `limit` post links
     * (never more than 20), then fetches every post page to obtain its
     * TL;DR text and date. One entry per post is appended to $this->items.
     */
    public function collectData()
    {
        // Fall back to the default when the input is missing or not numeric.
        $requested = $this->getInput('limit');
        $limit = is_numeric($requested) ? (int) $requested : 20;

        // Clamp to 0..20. Without the lower bound a negative value would reach
        // array_slice() as a negative length ("stop N elements before the end")
        // and could yield more than 20 posts.
        $limit = max(0, min($limit, 20));

        // Fetch the main blog page
        $html = getSimpleHTMLDOM(self::URI);
        if (!$html) {
            returnServerError('Could not fetch blog listing.');
        }

        // All post links live inside the main div
        $mainDiv = $html->find('div.main', 0);
        if (!$mainDiv) {
            returnServerError('Could not find blog post listing.');
        }

        $posts = [];
        foreach ($mainDiv->find('a') as $link) {
            // Normalise to string so an anchor without href cannot feed a
            // non-string value into strpos().
            $href = (string) $link->href;
            $title = (string) $link->plaintext;

            // Keep only blog post links; language variants are labelled
            // with brackets (e.g. "[zh]") and are skipped.
            if (strpos($href, '.html') === false || strpos($title, '[') !== false) {
                continue;
            }

            $posts[] = [
                'title' => trim($title),
                'url' => $href,
            ];
        }

        // Limit the number of posts before doing one HTTP request per post
        $posts = array_slice($posts, 0, $limit);

        // Fetch full content for each post
        foreach ($posts as $post) {
            $postUrl = $this->resolvePostUrl($post['url']);
            $postData = $this->fetchPostContent($postUrl);

            $this->items[] = [
                'uri' => $postUrl,
                'title' => $post['title'],
                'timestamp' => $postData['timestamp'],
                'author' => 'Anna',
                'content' => $postData['tldr'],
            ];
        }
    }

    /**
     * Turn a link target from the listing page into a fetchable post URL.
     *
     * @param string $href Value of the anchor's href attribute.
     * @return string The href itself when it is already an absolute http(s)
     *                URL, otherwise self::URI . '/' . $href.
     */
    private function resolvePostUrl($href)
    {
        // An absolute link must not be prefixed with the blog URI again.
        if (preg_match('#^https?://#i', $href) === 1) {
            return $href;
        }

        return self::URI . '/' . $href;
    }

    /**
     * Fetch a single post page and extract its TL;DR and date.
     *
     * @param string $url URL of the post page.
     * @return array Array with two keys:
     *               'tldr'      => string, inner HTML of the first ".tldr"
     *                              element (its plain text if the HTML is
     *                              empty), '' when no such element exists, or
     *                              a placeholder message when the page could
     *                              not be fetched;
     *               'timestamp' => int|null, Unix timestamp of the first
     *                              YYYY-MM-DD date found in div.main, or null
     *                              when none is found or it cannot be parsed.
     */
    private function fetchPostContent($url)
    {
        $html = getSimpleHTMLDOM($url);
        if (!$html) {
            return [
                'tldr' => 'Could not fetch post content.',
                'timestamp' => null,
            ];
        }

        // Extract TLDR from the first element carrying the "tldr" class
        $tldr = '';
        $tldrElement = $html->find('.tldr', 0);
        if ($tldrElement) {
            // Try innertext first, fall back to plaintext if empty
            $tldr = trim((string) $tldrElement->innertext);
            if (empty($tldr)) {
                $tldr = trim((string) $tldrElement->plaintext);
            }
        }

        // Extract date from the main content
        // Date appears as "annas-archive.li/blog, 2025-12-20"
        $timestamp = null;
        $mainDiv = $html->find('div.main', 0);
        if ($mainDiv && preg_match('/(\d{4}-\d{2}-\d{2})/', $mainDiv->plaintext, $matches)) {
            // The pattern only checks the digit layout, so strtotime() can
            // still reject the value (e.g. month 13) and return false; keep
            // null in that case instead of leaking false into the item.
            $parsed = strtotime($matches[1]);
            if ($parsed !== false) {
                $timestamp = $parsed;
            }
        }

        return [
            'tldr' => $tldr,
            'timestamp' => $timestamp,
        ];
    }
}