422 lines
20 KiB
PHP
422 lines
20 KiB
PHP
<?php
|
|
/**
 * RSS-Bridge bridge for the regional newspaper Main-Post (mainpost.de).
 *
 * Loads the overview page selected through the `ort` parameter, reads the
 * schema.org `NewsArticle` entries embedded as JSON-LD in that page and turns
 * each of them into one feed item.
 */
class MainPostBridge extends BridgeAbstract
{
    const NAME = 'Main-Post Nachrichten';
    const URI = 'https://www.mainpost.de/';
    const DESCRIPTION = 'Nachrichten und Artikel von der Main-Post';
    const MAINTAINER = 'Akamaru';
    const CACHE_TIMEOUT = 3600; // 1 hour

    // The list values are URL paths appended to self::URI; the empty value
    // selects the site root.
    const PARAMETERS = [
        'Regionen' => [
            'ort' => [
                'name' => 'Ort',
                'type' => 'list',
                'title' => 'Wähle den Ort',
                'required' => true,
                'values' => [
                    'Alles von Main-Post' => [
                        'Alle Nachrichten' => '',
                    ],
                    'Bad Kissingen' => [
                        'Alle Nachrichten' => 'bad-kissingen/alle-nachrichten',
                        'Bad Brückenau' => 'bad-kissingen/bad-brueckenau',
                        'Bad Kissingen' => 'bad-kissingen/bad-kissingen',
                        'Hammelburg' => 'bad-kissingen/hammelburg',
                        'Münnerstadt' => 'bad-kissingen/muennerstadt',
                    ],
                    'Hassberge' => [
                        'Alle Nachrichten' => 'hassberge/alle-nachrichten',
                    ],
                    'Kitzingen' => [
                        'Alle Nachrichten' => 'kitzingen/alle-nachrichten',
                    ],
                    'Main-Spessart' => [
                        'Alle Nachrichten' => 'main-spessart/alle-nachrichten',
                        'Gemünden' => 'main-spessart/gemuenden',
                        'Karlstadt' => 'main-spessart/karlstadt',
                        'Lohr' => 'main-spessart/lohr',
                        'Marktheidenfeld' => 'main-spessart/marktheidenfeld',
                    ],
                    'Main-Tauber' => [
                        'Alle Nachrichten' => 'main-tauber/alle-nachrichten',
                    ],
                    'Rhön-Grabfeld' => [
                        'Alle Nachrichten' => 'rhoengrabfeld/alle-nachrichten',
                        'Bad Königshofen' => 'rhoengrabfeld/bad-koenigshofen',
                        'Bad Neustadt' => 'rhoengrabfeld/bad-neustadt',
                        'Mellrichstadt' => 'rhoengrabfeld/mellrichstadt',
                    ],
                    'Schweinfurt' => [
                        'Alle Nachrichten' => 'schweinfurt/alle-nachrichten',
                        'Gerolzhofen' => 'schweinfurt/gerolzhofen',
                        'Schweinfurt' => 'schweinfurt/stadtschweinfurt',
                    ],
                    'Würzburg' => [
                        'Alle Nachrichten' => 'wuerzburg/alle-nachrichten',
                        'Ochsenfurt' => 'wuerzburg/ochsenfurt',
                        'Würzburg' => 'wuerzburg/stadtwuerzburg',
                    ],
                ],
            ],
        ],
    ];

    /**
     * Trace of the last JSON-LD extraction; only used in error messages.
     * Declared explicitly because dynamic properties are deprecated in PHP 8.2.
     *
     * @var string
     */
    private $debugJsonInfo = '';

    /**
     * Trace of the last HTML (teaser markup) extraction.
     *
     * @var string
     */
    private $debugHtmlInfo = '';

    /**
     * Returns the URL of the icon shown for this bridge.
     *
     * @return string
     */
    public function getIcon()
    {
        return 'https://www.google.com/s2/favicons?domain=www.mainpost.de&sz=32';
    }

    /**
     * Extracts all schema.org NewsArticle entries from the JSON-LD script
     * blocks of a page.
     *
     * A block may hold a single object or a list of objects. For each object,
     * the NewsArticle entries of its `hasPart` list are taken; if it has no
     * `hasPart` list, the object itself is taken when it is a NewsArticle.
     * A trace of the run is stored in $this->debugJsonInfo.
     *
     * @param string $html Raw HTML of the page.
     * @return array[] Decoded NewsArticle arrays; `image` is always in
     *                 ImageObject form (an array) when present.
     */
    private function extractArticlesFromJSON($html)
    {
        $articles = [];
        $debugInfo = 'HTML sample (first 1000 chars): ' . substr($html, 0, 1000) . "\n\n";

        // One tolerant pattern: either quote style, any attribute order,
        // extra attributes such as id="...".
        $pattern = '/<script\b[^>]*\btype\s*=\s*(["\'])application\/ld\+json\1[^>]*>(.*?)<\/script>/is';

        if (preg_match_all($pattern, $html, $matches)) {
            $debugInfo .= 'Found ' . count($matches[2]) . " JSON blocks\n";

            foreach ($matches[2] as $index => $jsonStr) {
                $jsonStr = trim($jsonStr);
                $debugInfo .= "JSON block $index (first 300 chars): " . substr($jsonStr, 0, 300) . "...\n";

                $data = json_decode($jsonStr, true);
                if (json_last_error() !== JSON_ERROR_NONE) {
                    $debugInfo .= "JSON Error in block $index: " . json_last_error_msg() . "\n";
                    continue;
                }
                // "null" or a bare scalar decodes without error but is no use here.
                if (!is_array($data)) {
                    $debugInfo .= "JSON block $index is neither an object nor a list, skipping\n";
                    continue;
                }
                $debugInfo .= 'JSON structure: ' . print_r(array_keys($data), true) . "\n";

                // A list (keys 0..n-1 in order) holds several nodes; anything
                // else is treated as one node.
                $nodes = ($data === array_values($data)) ? $data : [$data];

                foreach ($nodes as $subIndex => $node) {
                    foreach ($this->collectNewsArticles($node) as $article) {
                        $articles[] = $article;
                        $debugInfo .= "Added article from block $index/$subIndex: "
                            . ($article['headline'] ?? '(no headline)') . "\n";
                    }
                }
            }
        }

        $this->debugJsonInfo = $debugInfo;

        return $articles;
    }

    /**
     * Returns the NewsArticle entries contained in one decoded JSON-LD node.
     *
     * A `hasPart` list takes precedence over the node's own type.
     *
     * @param mixed $node One decoded JSON-LD value.
     * @return array[] Zero or more NewsArticle arrays with normalised image.
     */
    private function collectNewsArticles($node)
    {
        if (!is_array($node)) {
            return [];
        }

        if (isset($node['hasPart']) && is_array($node['hasPart'])) {
            $found = [];
            foreach ($node['hasPart'] as $part) {
                if (is_array($part) && ($part['@type'] ?? null) === 'NewsArticle') {
                    $found[] = $this->normalizeImage($part);
                }
            }
            return $found;
        }

        if (($node['@type'] ?? null) === 'NewsArticle') {
            return [$this->normalizeImage($node)];
        }

        return [];
    }

    /**
     * Wraps a non-array `image` value (a bare URL) into ImageObject form so
     * that consumers only have to deal with one shape.
     *
     * @param array $article Decoded NewsArticle.
     * @return array The article, with `image` as an array when it was set.
     */
    private function normalizeImage(array $article)
    {
        if (isset($article['image']) && !is_array($article['image'])) {
            $article['image'] = [
                '@type' => 'ImageObject',
                'url' => $article['image'],
            ];
        }

        return $article;
    }

    /**
     * Alternative extraction that reads articles from teaser markup instead
     * of JSON-LD.
     *
     * NOTE: collectData() currently does not call this method; it is kept as
     * a ready-made fallback. The first container selector that yields at
     * least one article wins. A trace is stored in $this->debugHtmlInfo.
     *
     * @param object $html Parsed document offering find(), as returned by
     *                     getSimpleHTMLDOM().
     * @return array[] Arrays with `headline`, `url`, `datePublished` and
     *                 optionally `image`.
     */
    private function extractArticlesFromHTML($html)
    {
        $articles = [];
        $debugInfo = '';

        $selectors = [
            'div.teaser, article',
            '.teaser-box',
            '.article-teaser',
            '.article-list-item',
            '.article-card',
            '.content-teaser',
        ];
        $headlineSelectors = [
            'a.headline-link', 'h2 a', 'h3 a', '.headline a',
            '.title a', 'a.title', 'h4 a', '.teaser-headline a',
        ];
        $timeSelectors = ['time', '.date', '.pubdate', '.time', '.timestamp'];

        foreach ($selectors as $selector) {
            $teasers = $html->find($selector);
            $debugInfo .= "Selector '$selector' found " . count($teasers) . " elements\n";

            foreach ($teasers as $teaser) {
                $headlineLink = null;
                foreach ($headlineSelectors as $headlineSelector) {
                    $headlineLink = $teaser->find($headlineSelector, 0);
                    if ($headlineLink) {
                        $debugInfo .= "Found headline with selector: $headlineSelector\n";
                        break;
                    }
                }
                if (!$headlineLink) {
                    continue;
                }

                $article = [
                    'headline' => trim($headlineLink->plaintext),
                    'url' => $this->absoluteUrl($headlineLink->href),
                ];

                $imageUrl = $this->findTeaserImage($teaser);
                if ($imageUrl !== null) {
                    $article['image'] = [
                        '@type' => 'ImageObject',
                        'url' => $imageUrl,
                    ];
                }

                foreach ($timeSelectors as $timeSelector) {
                    $date = $teaser->find($timeSelector, 0);
                    if ($date) {
                        $article['datePublished'] = isset($date->datetime)
                            ? $date->datetime
                            : trim($date->plaintext);
                        break;
                    }
                }
                if (!isset($article['datePublished'])) {
                    // The "Z" suffix means UTC, so the time must be UTC too.
                    $article['datePublished'] = gmdate('Y-m-d\TH:i:s\Z');
                }

                $articles[] = $article;
            }

            if (count($articles) > 0) {
                $debugInfo .= 'Found ' . count($articles)
                    . " articles with selector '$selector'. Stopping search.\n";
                break;
            }
        }

        $this->debugHtmlInfo = $debugInfo;

        return $articles;
    }

    /**
     * Looks for a teaser image: first any <img> with a src, then an <img>
     * inside a <figure>, then a CSS background-image on a <div>.
     *
     * @param object $teaser Teaser element offering find().
     * @return string|null Image URL as found in the markup, or null.
     */
    private function findTeaserImage($teaser)
    {
        $img = $teaser->find('img', 0);
        if ($img && !empty($img->src)) {
            return $img->src;
        }

        $figure = $teaser->find('figure', 0);
        if ($figure) {
            $img = $figure->find('img', 0);
            if ($img && !empty($img->src)) {
                return $img->src;
            }
        }

        $styled = $teaser->find('div[style*="background-image"]', 0);
        if ($styled && preg_match('/background-image:\s*url\([\'"]?(.*?)[\'"]?\)/i', $styled->style, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Turns a link target into an absolute URL on the Main-Post site.
     *
     * @param mixed $url href value; may be relative, protocol-relative or absolute.
     * @return string
     */
    private function absoluteUrl($url)
    {
        $url = trim((string) $url);

        if (strpos($url, '//') === 0) {
            return 'https:' . $url;
        }
        if (strpos($url, 'http') === 0) {
            return $url;
        }

        return self::URI . ltrim($url, '/');
    }

    /**
     * Escapes text for use in HTML content and attribute values without
     * double-encoding entities that are already present.
     *
     * @param mixed $text
     * @return string
     */
    private function escapeHtml($text)
    {
        return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
    }

    /**
     * Picks an image URL out of a JSON-LD `image` value, which may be a URL
     * string, an ImageObject array or a list of either.
     *
     * @param mixed $image
     * @return string|null
     */
    private function resolveImageUrl($image)
    {
        if (is_string($image)) {
            return $image !== '' ? $image : null;
        }
        if (!is_array($image)) {
            return null;
        }
        if (isset($image['url'])) {
            return is_string($image['url']) && $image['url'] !== '' ? $image['url'] : null;
        }
        if (isset($image[0])) {
            // List form: use the first entry.
            return $this->resolveImageUrl($image[0]);
        }

        return null;
    }

    /**
     * Derives the author line from a JSON-LD `author` value (a string, a
     * Person/Organization array or a list of those). Falls back to
     * "Main-Post" when no name can be found.
     *
     * @param mixed $author
     * @return string
     */
    private function resolveAuthor($author)
    {
        if (is_string($author) && $author !== '') {
            return $author;
        }

        if (is_array($author)) {
            if (isset($author['name']) && is_string($author['name'])) {
                return $author['name'];
            }

            $names = [];
            foreach ($author as $entry) {
                if (is_string($entry) && $entry !== '') {
                    $names[] = $entry;
                } elseif (is_array($entry) && isset($entry['name']) && is_string($entry['name'])) {
                    $names[] = $entry['name'];
                }
            }
            if ($names) {
                return implode(', ', $names);
            }
        }

        return 'Main-Post';
    }

    /**
     * Loads the selected overview page and fills $this->items with one item
     * per NewsArticle found in its JSON-LD data. Articles are deduplicated by
     * URL; the first occurrence wins.
     *
     * @throws \Exception When the page cannot be loaded or contains no
     *                    NewsArticle entries.
     */
    public function collectData()
    {
        // The empty selection ('Alles von Main-Post') maps to the site root.
        $url = self::URI . $this->getInput('ort');

        $html = getSimpleHTMLDOM($url);
        if (!$html) {
            throw new \Exception('Die Mainpost-Seite konnte nicht geladen werden: ' . $url);
        }

        $htmlContent = $html->save();
        $articles = $this->extractArticlesFromJSON($htmlContent);

        if (empty($articles)) {
            // Debug text is only needed for this error, so build it here.
            $debugInfo = 'URL: ' . $url . "\n"
                . "JSON Extraction Info:\n" . $this->debugJsonInfo . "\n"
                . 'Number of articles extracted from JSON: ' . count($articles) . "\n"
                . "HTML Sample:\n"
                . substr($htmlContent, 0, 500) . '... [truncated] ...' . substr($htmlContent, -500) . "\n";

            throw new \Exception('Keine Artikel im JSON-LD gefunden. Debug-Info: ' . $debugInfo);
        }

        $seenUrls = [];

        foreach ($articles as $article) {
            // Skip incomplete entries.
            if (
                !isset($article['headline'], $article['url'])
                || !is_string($article['headline'])
                || !is_string($article['url'])
            ) {
                continue;
            }

            // Skip articles that were already emitted under the same URL.
            if (isset($seenUrls[$article['url']])) {
                continue;
            }
            $seenUrls[$article['url']] = true;

            $item = [
                'title' => $article['headline'],
                'uri' => $article['url'],
            ];

            // datePublished is expected in ISO 8601 form (e.g.
            // 2025-04-25T15:00:00Z); fall back to "now" when absent or unparsable.
            $timestamp = isset($article['datePublished']) && is_string($article['datePublished'])
                ? strtotime($article['datePublished'])
                : false;
            $item['timestamp'] = ($timestamp !== false) ? $timestamp : time();

            $item['content'] = isset($article['description']) && is_string($article['description'])
                ? '<p>' . $this->escapeHtml($article['description']) . '</p>'
                : '';

            $imageUrl = isset($article['image']) ? $this->resolveImageUrl($article['image']) : null;
            if ($imageUrl !== null) {
                $imageUrl = str_replace('\\/', '/', $imageUrl);
                $item['content'] .= '<p><img src="' . $this->escapeHtml($imageUrl)
                    . '" alt="' . $this->escapeHtml($item['title']) . '"></p>';
                $item['enclosures'] = [$imageUrl];
            }

            $item['author'] = $this->resolveAuthor($article['author'] ?? null);

            // Category = the path segment three levels from the end of the URL.
            $categories = [];
            if (preg_match('/\/([^\/]+)\/[^\/]+\/[^\/]+$/', $article['url'], $matches)) {
                $categories[] = $matches[1];
            }
            $item['categories'] = $categories;

            $this->items[] = $item;
        }
    }
}
|