1
0
Files
Bridges/MainPostBridge.php
2025-11-19 14:18:33 +01:00

422 lines
20 KiB
PHP

<?php
/**
 * Bridge that builds a feed for mainpost.de from the schema.org JSON-LD
 * (`NewsArticle`) metadata embedded in the site's overview pages.
 *
 * The page to read is chosen via the `ort` parameter, whose value is a path
 * appended to {@see self::URI}.
 */
class MainPostBridge extends BridgeAbstract
{
    const NAME = 'Main-Post Nachrichten';
    const URI = 'https://www.mainpost.de/';
    const DESCRIPTION = 'Nachrichten und Artikel von der Main-Post';
    const MAINTAINER = 'Akamaru';
    const CACHE_TIMEOUT = 3600; // 1 hour

    const PARAMETERS = [
        'Regionen' => [
            'ort' => [
                'name' => 'Ort',
                'type' => 'list',
                'title' => 'Wähle den Ort',
                'required' => true,
                'values' => [
                    'Alles von Main-Post' => [
                        'Alle Nachrichten' => '',
                    ],
                    'Bad Kissingen' => [
                        'Alle Nachrichten' => 'bad-kissingen/alle-nachrichten',
                        'Bad Brückenau' => 'bad-kissingen/bad-brueckenau',
                        'Bad Kissingen' => 'bad-kissingen/bad-kissingen',
                        'Hammelburg' => 'bad-kissingen/hammelburg',
                        'Münnerstadt' => 'bad-kissingen/muennerstadt',
                    ],
                    'Hassberge' => [
                        'Alle Nachrichten' => 'hassberge/alle-nachrichten',
                    ],
                    'Kitzingen' => [
                        'Alle Nachrichten' => 'kitzingen/alle-nachrichten',
                    ],
                    'Main-Spessart' => [
                        'Alle Nachrichten' => 'main-spessart/alle-nachrichten',
                        'Gemünden' => 'main-spessart/gemuenden',
                        'Karlstadt' => 'main-spessart/karlstadt',
                        'Lohr' => 'main-spessart/lohr',
                        'Marktheidenfeld' => 'main-spessart/marktheidenfeld',
                    ],
                    'Main-Tauber' => [
                        'Alle Nachrichten' => 'main-tauber/alle-nachrichten',
                    ],
                    'Rhön-Grabfeld' => [
                        'Alle Nachrichten' => 'rhoengrabfeld/alle-nachrichten',
                        'Bad Königshofen' => 'rhoengrabfeld/bad-koenigshofen',
                        'Bad Neustadt' => 'rhoengrabfeld/bad-neustadt',
                        'Mellrichstadt' => 'rhoengrabfeld/mellrichstadt',
                    ],
                    'Schweinfurt' => [
                        'Alle Nachrichten' => 'schweinfurt/alle-nachrichten',
                        'Gerolzhofen' => 'schweinfurt/gerolzhofen',
                        'Schweinfurt' => 'schweinfurt/stadtschweinfurt',
                    ],
                    'Würzburg' => [
                        'Alle Nachrichten' => 'wuerzburg/alle-nachrichten',
                        'Ochsenfurt' => 'wuerzburg/ochsenfurt',
                        'Würzburg' => 'wuerzburg/stadtwuerzburg',
                    ],
                ]
            ]
        ]
    ];

    /**
     * Trace written by extractArticlesFromJSON(); only surfaced in the
     * exception message when no article could be extracted.
     *
     * Declared explicitly because creating properties on the fly is
     * deprecated as of PHP 8.2.
     *
     * @var string
     */
    private $debugJsonInfo = '';

    /**
     * Trace written by extractArticlesFromHTML().
     *
     * @var string
     */
    private $debugHtmlInfo = '';

    /**
     * @return string URL of a 32px favicon for www.mainpost.de.
     */
    public function getIcon()
    {
        return 'https://www.google.com/s2/favicons?domain=www.mainpost.de&sz=32';
    }

    /**
     * Extract every schema.org `NewsArticle` found in the JSON-LD script
     * blocks of a page.
     *
     * Each block may be a single object or a list of objects; every object is
     * inspected via newsArticlesFromNode(). Blocks that are not valid JSON, or
     * that decode to a scalar/null, are skipped and noted in the debug trace.
     *
     * @param string $html Raw HTML of the page.
     * @return array List of article arrays as decoded from JSON-LD, with a
     *               scalar `image` wrapped into an ImageObject-shaped array.
     */
    private function extractArticlesFromJSON($html)
    {
        $articles = [];
        $debugInfo = 'HTML sample (first 1000 chars): ' . substr($html, 0, 1000) . "\n\n";

        // A single pattern covers single- or double-quoted type attributes and
        // any additional attributes (e.g. id) before or after it.
        $pattern = '/<script\b[^>]*\btype\s*=\s*(["\'])application\/ld\+json\1[^>]*>(.*?)<\/script>/is';

        if (preg_match_all($pattern, $html, $matches)) {
            $debugInfo .= 'Found ' . count($matches[2]) . " JSON blocks\n";

            foreach ($matches[2] as $index => $jsonStr) {
                $debugInfo .= "JSON block $index (first 300 chars): " . substr($jsonStr, 0, 300) . "...\n";

                $data = json_decode(trim($jsonStr), true);
                if (json_last_error() !== JSON_ERROR_NONE) {
                    $debugInfo .= "JSON Error in block $index: " . json_last_error_msg() . "\n";
                    continue;
                }

                // Valid JSON may still be a scalar or null; array_keys() would
                // throw a TypeError on those under PHP 8.
                if (!is_array($data)) {
                    $debugInfo .= "JSON block $index is not an object or array, skipped\n";
                    continue;
                }

                $debugInfo .= 'JSON structure: ' . print_r(array_keys($data), true) . "\n";

                // A list is a sequence of nodes; a keyed object is one node.
                $nodes = $this->isList($data) ? $data : [$data];
                $found = 0;
                foreach ($nodes as $node) {
                    foreach ($this->newsArticlesFromNode($node) as $article) {
                        $articles[] = $article;
                        $found++;
                    }
                }
                $debugInfo .= "Found $found NewsArticle items in block $index\n";
            }
        }

        $this->debugJsonInfo = $debugInfo;

        return $articles;
    }

    /**
     * Collect the NewsArticle entries represented by one JSON-LD node.
     *
     * A node carrying a `hasPart` array contributes the NewsArticle entries of
     * that array (and nothing else); otherwise the node itself is returned if
     * it is a NewsArticle.
     *
     * @param mixed $node Decoded JSON-LD value.
     * @return array List of normalised article arrays (possibly empty).
     */
    private function newsArticlesFromNode($node)
    {
        if (!is_array($node)) {
            return [];
        }

        if (isset($node['hasPart']) && is_array($node['hasPart'])) {
            $parts = [];
            foreach ($node['hasPart'] as $part) {
                if ($this->isNewsArticle($part)) {
                    $parts[] = $this->normalizeImage($part);
                }
            }
            return $parts;
        }

        return $this->isNewsArticle($node) ? [$this->normalizeImage($node)] : [];
    }

    /**
     * @param mixed $node Decoded JSON-LD value.
     * @return bool True when $node is an array whose `@type` is exactly 'NewsArticle'.
     */
    private function isNewsArticle($node)
    {
        return is_array($node) && isset($node['@type']) && $node['@type'] === 'NewsArticle';
    }

    /**
     * Wrap a non-array `image` value into an ImageObject-shaped array so that
     * consumers can always read `image['url']`.
     *
     * @param array $article Article array.
     * @return array The article with a normalised `image` entry.
     */
    private function normalizeImage(array $article)
    {
        if (isset($article['image']) && !is_array($article['image'])) {
            $article['image'] = [
                '@type' => 'ImageObject',
                'url' => $article['image']
            ];
        }

        return $article;
    }

    /**
     * @param array $value Any array.
     * @return bool True when the keys are exactly 0..n-1 in order (an empty array counts as a list).
     */
    private function isList(array $value)
    {
        return array_values($value) === $value;
    }

    /**
     * Alternative extraction that scrapes teaser elements from the parsed DOM
     * instead of reading JSON-LD.
     *
     * NOTE: collectData() does not call this method; it is kept as a
     * ready-made fallback.
     *
     * The candidate selectors are tried in order and the search stops at the
     * first selector that yields at least one article.
     *
     * @param object $html Parsed DOM exposing find(); presumably the object
     *                     returned by getSimpleHTMLDOM() - verify before wiring it up.
     * @return array List of arrays with `headline`, `url`, `datePublished`
     *               and optionally `image`.
     */
    private function extractArticlesFromHTML($html)
    {
        $articles = [];
        $debugInfo = '';

        $selectors = [
            'div.teaser, article',
            '.teaser-box',
            '.article-teaser',
            '.article-list-item',
            '.article-card',
            '.content-teaser'
        ];
        $headlineSelectors = [
            'a.headline-link',
            'h2 a',
            'h3 a',
            '.headline a',
            '.title a',
            'a.title',
            'h4 a',
            '.teaser-headline a'
        ];
        $timeSelectors = ['time', '.date', '.pubdate', '.time', '.timestamp'];

        foreach ($selectors as $selector) {
            $teasers = $html->find($selector);
            $debugInfo .= "Selector '$selector' found " . count($teasers) . " elements\n";

            foreach ($teasers as $teaser) {
                $headlineLink = null;
                foreach ($headlineSelectors as $headlineSelector) {
                    $headlineLink = $teaser->find($headlineSelector, 0);
                    if ($headlineLink) {
                        $debugInfo .= "Found headline with selector: $headlineSelector\n";
                        break;
                    }
                }
                if (!$headlineLink) {
                    continue;
                }

                $article = [
                    'headline' => trim($headlineLink->plaintext),
                    'url' => (string) $headlineLink->href,
                ];

                // Turn relative links into absolute ones.
                if (strpos($article['url'], 'http') !== 0) {
                    $article['url'] = self::URI . ltrim($article['url'], '/');
                }

                // Image: first <img> in the teaser (including any inside a
                // <figure>) with a non-empty src, else a CSS background image.
                foreach ($teaser->find('img') as $img) {
                    if (!empty($img->src)) {
                        $article['image'] = [
                            '@type' => 'ImageObject',
                            'url' => $img->src
                        ];
                        break;
                    }
                }
                if (!isset($article['image'])) {
                    $divWithStyle = $teaser->find('div[style*="background-image"]', 0);
                    if (
                        $divWithStyle
                        && preg_match(
                            '/background-image:\s*url\([\'"]?(.*?)[\'"]?\)/i',
                            $divWithStyle->style,
                            $matches
                        )
                    ) {
                        $article['image'] = [
                            '@type' => 'ImageObject',
                            'url' => $matches[1]
                        ];
                    }
                }

                // Publication date: prefer the machine-readable datetime attribute.
                foreach ($timeSelectors as $timeSelector) {
                    $date = $teaser->find($timeSelector, 0);
                    if ($date) {
                        $article['datePublished'] = isset($date->datetime)
                            ? $date->datetime
                            : trim($date->plaintext);
                        break;
                    }
                }
                if (!isset($article['datePublished'])) {
                    // gmdate(), not date(): the trailing "Z" claims UTC.
                    $article['datePublished'] = gmdate('Y-m-d\TH:i:s\Z');
                }

                $articles[] = $article;
            }

            // Stop at the first selector that produced articles.
            if (count($articles) > 0) {
                $debugInfo .= 'Found ' . count($articles) . " articles with selector '$selector'. Stopping search.\n";
                break;
            }
        }

        $this->debugHtmlInfo = $debugInfo;

        return $articles;
    }

    /**
     * Fetch the selected overview page and turn its JSON-LD articles into
     * feed items (one item per distinct article URL).
     *
     * @throws \Exception When the page cannot be loaded or contains no
     *                    NewsArticle in its JSON-LD blocks.
     */
    public function collectData()
    {
        // An empty value selects the front page, so one concatenation covers both cases.
        $url = self::URI . $this->getInput('ort');

        $html = getSimpleHTMLDOM($url);
        if (!$html) {
            throw new \Exception('Die Mainpost-Seite konnte nicht geladen werden: ' . $url);
        }

        $htmlContent = $html->save();
        $articles = $this->extractArticlesFromJSON($htmlContent);

        if (empty($articles)) {
            // The diagnostic trace is only ever shown here, so it is only built here.
            $debugInfo = 'URL: ' . $url . "\n"
                . "JSON Extraction Info:\n" . $this->debugJsonInfo . "\n"
                . "Number of articles extracted from JSON: 0\n"
                . "HTML Sample:\n"
                . substr($htmlContent, 0, 500) . '... [truncated] ...' . substr($htmlContent, -500) . "\n";
            throw new \Exception('Keine Artikel im JSON-LD gefunden. Debug-Info: ' . $debugInfo);
        }

        $seenUris = [];
        foreach ($articles as $article) {
            // Skip incomplete articles.
            if (
                !isset($article['headline'], $article['url'])
                || !is_string($article['headline'])
                || !is_string($article['url'])
            ) {
                continue;
            }

            // De-duplicate by URL; the first occurrence wins.
            if (isset($seenUris[$article['url']])) {
                continue;
            }
            $seenUris[$article['url']] = true;

            $item = [
                'title' => $article['headline'],
                'uri' => $article['url'],
                'timestamp' => $this->resolveTimestamp($article),
                'content' => '',
                'author' => $this->resolveAuthor($article),
            ];

            if (isset($article['description']) && is_string($article['description'])) {
                $item['content'] = '<p>' . $this->escape($article['description']) . '</p>';
            }

            $imageUrl = isset($article['image']) ? $this->resolveImageUrl($article['image']) : null;
            if ($imageUrl !== null) {
                $item['content'] .= '<p><img src="' . $this->escape($imageUrl)
                    . '" alt="' . $this->escape($item['title']) . '"></p>';
                $item['enclosures'] = [$imageUrl];
            }

            // Category: the third path segment from the end of the article URL.
            $categories = [];
            if (preg_match('/\/([^\/]+)\/[^\/]+\/[^\/]+$/', $article['url'], $matches)) {
                $categories[] = $matches[1];
            }
            $item['categories'] = $categories;

            $this->items[] = $item;
        }
    }

    /**
     * @param array $article Article array.
     * @return int Unix timestamp parsed from `datePublished`, or the current
     *             time when the field is missing or cannot be parsed.
     */
    private function resolveTimestamp(array $article)
    {
        if (isset($article['datePublished']) && is_string($article['datePublished'])) {
            $timestamp = strtotime($article['datePublished']);
            if ($timestamp !== false) {
                return $timestamp;
            }
        }

        return time();
    }

    /**
     * Determine the author name of an article.
     *
     * Accepts a plain string, a single object with `name`, or a list of such
     * strings/objects (names are joined with ", ").
     *
     * @param array $article Article array.
     * @return string Author name(s), or 'Main-Post' when none can be determined.
     */
    private function resolveAuthor(array $article)
    {
        $author = isset($article['author']) ? $article['author'] : null;
        $entries = (is_array($author) && $this->isList($author)) ? $author : [$author];

        $names = [];
        foreach ($entries as $entry) {
            if (is_array($entry) && isset($entry['name'])) {
                $entry = $entry['name'];
            }
            if (is_string($entry) && $entry !== '') {
                $names[] = $entry;
            }
        }

        return $names ? implode(', ', $names) : 'Main-Post';
    }

    /**
     * Pull a usable URL out of a JSON-LD `image` value.
     *
     * @param mixed $image String URL, ImageObject-shaped array, or a list of either.
     * @return string|null The image URL, or null when none is present.
     */
    private function resolveImageUrl($image)
    {
        if (is_array($image)) {
            if (isset($image['url'])) {
                $image = $image['url'];
            } elseif (isset($image[0])) {
                // A list of images: use the first entry.
                return $this->resolveImageUrl($image[0]);
            }
        }

        if (!is_string($image) || $image === '') {
            return null;
        }

        // Undo JSON-style escaped slashes in case the value was double-encoded.
        return str_replace('\\/', '/', $image);
    }

    /**
     * @param string $text Text taken from the remote page.
     * @return string $text made safe for HTML element and attribute context.
     */
    private function escape($text)
    {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}