Bridges/MainPostBridge.php

<?php
/**
 * RSS-Bridge bridge for the Main-Post newspaper (mainpost.de).
 *
 * Articles are read from the schema.org JSON-LD blocks embedded in the
 * selected overview page. A DOM-scraping extractor is kept as an alternative,
 * but it is currently not called by collectData().
 */
class MainPostBridge extends BridgeAbstract
{
    const NAME = 'Main-Post Nachrichten';
    const URI = 'https://www.mainpost.de/';
    const DESCRIPTION = 'Nachrichten und Artikel von der Main-Post';
    const MAINTAINER = 'Akamaru';
    const CACHE_TIMEOUT = 3600; // 1 hour

    const PARAMETERS = [
        'Regionen' => [
            'ort' => [
                'name' => 'Ort',
                'type' => 'list',
                'title' => 'Wähle den Ort',
                'required' => true,
                'values' => [
                    'Alles von Main-Post' => [
                        'Alle Nachrichten' => '',
                    ],
                    'Bad Kissingen' => [
                        'Alle Nachrichten' => 'bad-kissingen/alle-nachrichten',
                        'Bad Brückenau' => 'bad-kissingen/bad-brueckenau',
                        'Bad Kissingen' => 'bad-kissingen/bad-kissingen',
                        'Hammelburg' => 'bad-kissingen/hammelburg',
                        'Münnerstadt' => 'bad-kissingen/muennerstadt',
                    ],
                    'Hassberge' => [
                        'Alle Nachrichten' => 'hassberge/alle-nachrichten',
                    ],
                    'Kitzingen' => [
                        'Alle Nachrichten' => 'kitzingen/alle-nachrichten',
                    ],
                    'Main-Spessart' => [
                        'Alle Nachrichten' => 'main-spessart/alle-nachrichten',
                        'Gemünden' => 'main-spessart/gemuenden',
                        'Karlstadt' => 'main-spessart/karlstadt',
                        'Lohr' => 'main-spessart/lohr',
                        'Marktheidenfeld' => 'main-spessart/marktheidenfeld',
                    ],
                    'Main-Tauber' => [
                        'Alle Nachrichten' => 'main-tauber/alle-nachrichten',
                    ],
                    'Rhön-Grabfeld' => [
                        'Alle Nachrichten' => 'rhoengrabfeld/alle-nachrichten',
                        'Bad Königshofen' => 'rhoengrabfeld/bad-koenigshofen',
                        'Bad Neustadt' => 'rhoengrabfeld/bad-neustadt',
                        'Mellrichstadt' => 'rhoengrabfeld/mellrichstadt',
                    ],
                    'Schweinfurt' => [
                        'Alle Nachrichten' => 'schweinfurt/alle-nachrichten',
                        'Gerolzhofen' => 'schweinfurt/gerolzhofen',
                        'Schweinfurt' => 'schweinfurt/stadtschweinfurt',
                    ],
                    'Würzburg' => [
                        'Alle Nachrichten' => 'wuerzburg/alle-nachrichten',
                        'Ochsenfurt' => 'wuerzburg/ochsenfurt',
                        'Würzburg' => 'wuerzburg/stadtwuerzburg',
                    ],
                ]
            ]
        ]
    ];

    /**
     * Trace of the most recent JSON-LD extraction; only surfaced in the
     * exception thrown when no article could be found.
     *
     * Declared explicitly because dynamic properties are deprecated since PHP 8.2.
     *
     * @var string
     */
    private $debugJsonInfo = '';

    /**
     * Trace of the most recent DOM-based extraction.
     *
     * @var string
     */
    private $debugHtmlInfo = '';

    /**
     * Returns the URL of the favicon shown for this bridge.
     *
     * @return string
     */
    public function getIcon()
    {
        return 'https://www.google.com/s2/favicons?domain=www.mainpost.de&sz=32';
    }

    /**
     * Extracts all schema.org NewsArticle entries from the JSON-LD
     * <script> blocks of a page.
     *
     * A block may be a single object or a list of objects. For every object,
     * the entries of its "hasPart" member are used when present; otherwise
     * the object itself is used if it is a NewsArticle. Blocks that are not
     * valid JSON, or that decode to a scalar, are skipped.
     *
     * Side effect: stores a short trace in $this->debugJsonInfo.
     *
     * @param string $html Raw HTML of the page.
     * @return array List of decoded NewsArticle arrays (possibly empty).
     */
    private function extractArticlesFromJSON($html)
    {
        $articles = [];
        $trace = [];

        // One pattern for every quoting style and attribute order of the
        // script tag (type="...", type='...', additional id/class/nonce).
        $pattern = '/<script\b[^>]*\btype\s*=\s*(["\'])application\/ld\+json\1[^>]*>(.*?)<\/script>/is';

        $matches = [];
        $matchCount = preg_match_all($pattern, $html, $matches);
        if ($matchCount === false) {
            // E.g. backtrack limit reached on a very large document.
            $trace[] = 'JSON-LD regex failed, preg_last_error() = ' . preg_last_error();
        }

        $blocks = $matchCount ? $matches[2] : [];
        $trace[] = 'Found ' . count($blocks) . ' JSON-LD blocks';

        foreach ($blocks as $index => $jsonStr) {
            $data = json_decode(trim($jsonStr), true);
            if (json_last_error() !== JSON_ERROR_NONE) {
                $trace[] = "JSON error in block $index: " . json_last_error_msg();
                continue;
            }
            if (!is_array($data)) {
                // Valid JSON, but a scalar or null: nothing to extract.
                $trace[] = "Block $index is not a JSON object or array, skipped";
                continue;
            }

            $nodes = self::isSequentialList($data) ? $data : [$data];
            $found = 0;
            foreach ($nodes as $node) {
                foreach ($this->newsArticlesFromNode($node) as $article) {
                    $articles[] = $article;
                    $found++;
                }
            }
            $trace[] = "Block $index yielded $found NewsArticle entries";
        }

        $this->debugJsonInfo = implode("\n", $trace) . "\n";

        return $articles;
    }

    /**
     * Returns the NewsArticle entries contained in one decoded JSON-LD node.
     *
     * @param mixed $node One decoded JSON-LD value.
     * @return array List of NewsArticle arrays.
     */
    private function newsArticlesFromNode($node)
    {
        if (!is_array($node)) {
            return [];
        }

        // A collection page lists its articles under "hasPart"; in that case
        // the container itself is never reported as an article.
        if (isset($node['hasPart']) && is_array($node['hasPart'])) {
            $candidates = $node['hasPart'];
        } else {
            $candidates = [$node];
        }

        $result = [];
        foreach ($candidates as $candidate) {
            if (self::isNewsArticle($candidate)) {
                $result[] = $candidate;
            }
        }

        return $result;
    }

    /**
     * Tells whether a decoded JSON-LD value is typed as NewsArticle.
     * "@type" may be a single string or a list of type names.
     *
     * @param mixed $node
     * @return bool
     */
    private static function isNewsArticle($node)
    {
        if (!is_array($node) || !isset($node['@type'])) {
            return false;
        }

        return in_array('NewsArticle', (array) $node['@type'], true);
    }

    /**
     * Tells whether an array has the keys 0..n-1 in order (a JSON array
     * rather than a JSON object). An empty array counts as a list.
     *
     * @param array $value
     * @return bool
     */
    private static function isSequentialList(array $value)
    {
        return array_values($value) === $value;
    }

    /**
     * Alternative extractor that scrapes teaser elements from the parsed DOM.
     *
     * Currently not called by collectData(). The selectors are tried in
     * order and the first one that yields at least one article wins.
     *
     * Side effect: stores a short trace in $this->debugHtmlInfo.
     *
     * @param object $html Parsed document offering find() (as returned by getSimpleHTMLDOM()).
     * @return array List of article arrays with the keys "headline", "url",
     *               "datePublished" and optionally "image".
     */
    private function extractArticlesFromHTML($html)
    {
        $articles = [];
        $trace = [];

        $selectors = [
            'div.teaser, article',
            '.teaser-box',
            '.article-teaser',
            '.article-list-item',
            '.article-card',
            '.content-teaser'
        ];

        foreach ($selectors as $selector) {
            $teasers = $html->find($selector);
            $trace[] = "Selector '$selector' found " . count($teasers) . ' elements';

            foreach ($teasers as $teaser) {
                $article = $this->articleFromTeaser($teaser);
                if ($article !== null) {
                    $articles[] = $article;
                }
            }

            if (count($articles) > 0) {
                $trace[] = 'Found ' . count($articles) . " articles with selector '$selector'. Stopping search.";
                break;
            }
        }

        $this->debugHtmlInfo = implode("\n", $trace) . "\n";

        return $articles;
    }

    /**
     * Builds an article array from a single teaser element.
     *
     * @param object $teaser DOM element offering find().
     * @return array|null The article, or null when the teaser has no usable headline link.
     */
    private function articleFromTeaser($teaser)
    {
        $headlineSelectors = [
            'a.headline-link', 'h2 a', 'h3 a', '.headline a',
            '.title a', 'a.title', 'h4 a', '.teaser-headline a'
        ];

        $headlineLink = null;
        foreach ($headlineSelectors as $headlineSelector) {
            $headlineLink = $teaser->find($headlineSelector, 0);
            if ($headlineLink) {
                break;
            }
        }
        if (!$headlineLink) {
            return null;
        }

        $href = $headlineLink->href;
        if (!is_string($href) || trim($href) === '') {
            // A link without a target cannot become a feed item.
            return null;
        }
        $href = trim($href);

        if (strpos($href, '//') === 0) {
            // Protocol-relative URL.
            $href = 'https:' . $href;
        } elseif (!preg_match('#^https?://#i', $href)) {
            $href = self::URI . ltrim($href, '/');
        }

        $article = [
            'headline' => trim($headlineLink->plaintext),
            'url' => $href,
        ];

        $imageUrl = $this->imageUrlFromTeaser($teaser);
        if ($imageUrl !== null) {
            $article['image'] = [
                '@type' => 'ImageObject',
                'url' => $imageUrl
            ];
        }

        foreach (['time', '.date', '.pubdate', '.time', '.timestamp'] as $timeSelector) {
            $date = $teaser->find($timeSelector, 0);
            if ($date) {
                $datetime = $date->datetime;
                $article['datePublished'] = (is_string($datetime) && $datetime !== '')
                    ? $datetime
                    : trim($date->plaintext);
                break;
            }
        }

        if (!isset($article['datePublished'])) {
            // gmdate(), not date(): the trailing "Z" claims UTC.
            $article['datePublished'] = gmdate('Y-m-d\TH:i:s\Z');
        }

        return $article;
    }

    /**
     * Finds an image URL for a teaser: the first <img> with a non-empty src,
     * otherwise a CSS background-image declared in an inline style.
     *
     * @param object $teaser DOM element offering find().
     * @return string|null
     */
    private function imageUrlFromTeaser($teaser)
    {
        foreach ($teaser->find('img') as $img) {
            $src = $img->src;
            if (is_string($src) && $src !== '') {
                return $src;
            }
        }

        $styled = $teaser->find('div[style*="background-image"]', 0);
        if ($styled && preg_match('/background-image:\s*url\([\'"]?(.*?)[\'"]?\)/i', $styled->style, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Loads the selected overview page and fills $this->items with one feed
     * item per NewsArticle found in its JSON-LD data, at most one per URL.
     *
     * @throws \Exception When the page cannot be loaded or contains no article.
     */
    public function collectData()
    {
        // The list value is the path below the site root ('' = front page).
        $url = self::URI . $this->getInput('ort');

        $html = getSimpleHTMLDOM($url);
        if (!$html) {
            throw new \Exception('Die Mainpost-Seite konnte nicht geladen werden: ' . $url);
        }

        $htmlContent = $html->save();
        $articles = $this->extractArticlesFromJSON($htmlContent);

        if (empty($articles)) {
            // Debug details are only assembled on the failure path.
            $debugInfo = 'URL: ' . $url . "\n"
                . "JSON Extraction Info:\n" . $this->debugJsonInfo . "\n"
                . "HTML Sample:\n" . substr($htmlContent, 0, 500) . "\n";
            throw new \Exception('Keine Artikel im JSON-LD gefunden. Debug-Info: ' . $debugInfo);
        }

        $seenUrls = [];
        foreach ($articles as $article) {
            if (
                !isset($article['headline'], $article['url'])
                || !is_string($article['headline'])
                || !is_string($article['url'])
                || $article['url'] === ''
            ) {
                continue; // Skip incomplete articles.
            }

            // The same article may appear in several JSON-LD blocks.
            if (isset($seenUrls[$article['url']])) {
                continue;
            }
            $seenUrls[$article['url']] = true;

            $this->items[] = $this->buildItem($article);
        }
    }

    /**
     * Converts one NewsArticle array into an RSS-Bridge feed item.
     *
     * @param array $article Article with string "headline" and "url" members.
     * @return array Feed item (title, uri, timestamp, content, author,
     *               categories and, when an image exists, enclosures).
     */
    private function buildItem(array $article)
    {
        $title = $article['headline'];

        $item = [
            'title' => $title,
            'uri' => $article['url'],
            'timestamp' => time(),
            'content' => '',
            'author' => $this->resolveAuthor(isset($article['author']) ? $article['author'] : null),
            'categories' => [],
        ];

        // Fall back to "now" when the date is missing or unparsable.
        if (isset($article['datePublished']) && is_string($article['datePublished'])) {
            $timestamp = strtotime($article['datePublished']);
            if ($timestamp !== false) {
                $item['timestamp'] = $timestamp;
            }
        }

        if (isset($article['description']) && is_string($article['description'])) {
            $item['content'] = '<p>' . self::escapeHtml($article['description']) . '</p>';
        }

        $imageUrl = isset($article['image']) ? $this->resolveImageUrl($article['image']) : null;
        if ($imageUrl !== null) {
            $item['content'] .= '<p><img src="' . self::escapeHtml($imageUrl)
                . '" alt="' . self::escapeHtml($title) . '"></p>';
            $item['enclosures'] = [$imageUrl];
        }

        // The third-to-last path segment of the article URL is used as category.
        if (preg_match('/\/([^\/]+)\/[^\/]+\/[^\/]+$/', $article['url'], $matches)) {
            $item['categories'][] = $matches[1];
        }

        return $item;
    }

    /**
     * Resolves the URL from a schema.org "image" value, which may be a URL
     * string, an ImageObject with a "url" member, or a list of either (the
     * first usable entry wins).
     *
     * @param mixed $image
     * @return string|null
     */
    private function resolveImageUrl($image)
    {
        if (is_string($image)) {
            // Undo escaped slashes in case the value was double-encoded.
            $image = str_replace('\\/', '/', $image);
            return $image !== '' ? $image : null;
        }

        if (is_array($image)) {
            if (isset($image['url'])) {
                return $this->resolveImageUrl($image['url']);
            }
            if (isset($image[0])) {
                return $this->resolveImageUrl($image[0]);
            }
        }

        return null;
    }

    /**
     * Resolves a display name from a schema.org "author" value, which may be
     * a plain string, an object with a "name" member, or a list of either.
     *
     * @param mixed $author
     * @return string Comma-separated names, or "Main-Post" when none is available.
     */
    private function resolveAuthor($author)
    {
        $entries = (is_array($author) && self::isSequentialList($author)) ? $author : [$author];

        $names = [];
        foreach ($entries as $entry) {
            if (is_array($entry) && isset($entry['name'])) {
                $entry = $entry['name'];
            }
            if (is_string($entry) && trim($entry) !== '') {
                $names[] = trim($entry);
            }
        }

        return $names ? implode(', ', $names) : 'Main-Post';
    }

    /**
     * Escapes text for use in HTML element content and quoted attributes.
     * Existing entities are left alone so they are not encoded twice.
     *
     * @param string $text
     * @return string
     */
    private static function escapeHtml($text)
    {
        return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
    }
}