Complete rewrite
serien_checker/scraper/browser_scraper.py (new file, 307 lines)
@@ -0,0 +1,307 @@
"""
Browser-based scraper for fernsehserien.de using browser-tools
"""

import re
from typing import List, Dict, Optional, Tuple
from datetime import datetime
from dataclasses import dataclass

from ..database.models import SeasonType, DatePreference
from ..utils.logger import setup_logger

logger = setup_logger()

@dataclass
class ScrapedEpisode:
    """Scraped episode data"""
    episode_code: str
    title: str
    episode_number: Optional[int] = None  # Overall episode number from fernsehserien.de
    episode_id: Optional[str] = None  # Episode ID from fernsehserien.de URL (e.g., "1828679")
    date_de_tv: Optional[datetime] = None
    date_de_streaming: Optional[datetime] = None
    date_de_home_media: Optional[datetime] = None
    date_de_sync: Optional[datetime] = None
    date_original: Optional[datetime] = None


@dataclass
class ScrapedSeason:
    """Scraped season data"""
    name: str
    season_type: SeasonType
    sort_order: int
    episodes: List[ScrapedEpisode]
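
# A minimal construction example for the two records above (illustrative
# values only; the episode_id "1828679" is the sample from the field
# comment, everything else is made up for demonstration):
#
#     ep = ScrapedEpisode(episode_code="01", title="Pilot",
#                         episode_number=1, episode_id="1828679",
#                         date_original=datetime(2011, 12, 4))
#     season = ScrapedSeason(name="Staffel 1", season_type=SeasonType.NORMAL,
#                            sort_order=1, episodes=[ep])
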
class BrowserScraper:
    """
    Scraper for fernsehserien.de using browser automation

    This class uses the browser-tools skill to interact with web pages
    and extract structured data from the DOM.
    """

    BASE_URL = "https://www.fernsehserien.de"

    def __init__(self, browser_page=None):
        """
        Initialize scraper

        Args:
            browser_page: Browser page instance from browser-tools skill
        """
        self.page = browser_page

    @staticmethod
    def extract_series_slug(url: str) -> str:
        """Extract series slug from URL"""
        # https://www.fernsehserien.de/black-mirror/episodenguide -> black-mirror
        match = re.search(r'fernsehserien\.de/([^/]+)', url)
        return match.group(1) if match else ""

    @staticmethod
    def parse_german_date(date_str: str) -> Optional[datetime]:
        """
        Parse German date format to datetime

        Supports formats:
        - DD.MM.YYYY
        - DD.MM.YY
        - YYYY
        """
        if not date_str or date_str.strip() == "":
            return None

        date_str = date_str.strip()

        # Try DD.MM.YYYY or DD.MM.YY (the four-digit pattern must come
        # first, or the two-digit pattern would truncate a full year)
        patterns = [
            r'(\d{1,2})\.(\d{1,2})\.(\d{4})',  # DD.MM.YYYY
            r'(\d{1,2})\.(\d{1,2})\.(\d{2})',  # DD.MM.YY
        ]

        for pattern in patterns:
            match = re.search(pattern, date_str)
            if match:
                day, month, year = match.groups()
                if len(year) == 2:
                    year = f"20{year}"
                try:
                    return datetime(int(year), int(month), int(day))
                except ValueError:
                    continue

        # Try just year (YYYY)
        year_match = re.search(r'\b(19\d{2}|20\d{2})\b', date_str)
        if year_match:
            try:
                return datetime(int(year_match.group(1)), 1, 1)
            except ValueError:
                pass

        return None
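    # Expected behaviour of the parser above, for quick reference (note
    # that two-digit years always expand to 20xx, so "15.03.99" parses
    # as 2099, not 1999):
    #
    #     parse_german_date("15.03.2024")  # -> datetime(2024, 3, 15)
    #     parse_german_date("15.03.24")    # -> datetime(2024, 3, 15)
    #     parse_german_date("2024")        # -> datetime(2024, 1, 1)
    #     parse_german_date("kein Datum")  # -> None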
    @staticmethod
    def classify_season_type(season_name: str) -> SeasonType:
        """
        Classify season type based on name

        Args:
            season_name: Season name (e.g., "Staffel 1", "Specials", "2022")

        Returns:
            SeasonType enum value
        """
        name_lower = season_name.lower()

        # Check for specials
        if any(keyword in name_lower for keyword in ['special', 'specials']):
            return SeasonType.SPECIALS

        # Check for extras
        if any(keyword in name_lower for keyword in ['extra', 'extras', 'bonus']):
            return SeasonType.EXTRAS

        # Check for best-of
        if any(keyword in name_lower for keyword in ['best', 'best-of', 'best of']):
            return SeasonType.BEST_OF

        # Check for year-based (e.g., "2021", "2022")
        if re.match(r'^(19|20)\d{2}$', season_name.strip()):
            return SeasonType.YEAR_BASED

        # Default to normal
        return SeasonType.NORMAL
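    # How the checks above resolve in practice (order matters: a name like
    # "Best-of Specials" returns SPECIALS because that check runs first):
    #
    #     classify_season_type("Staffel 1")      # -> SeasonType.NORMAL
    #     classify_season_type("Specials")       # -> SeasonType.SPECIALS
    #     classify_season_type("Bonusmaterial")  # -> SeasonType.EXTRAS
    #     classify_season_type("2022")           # -> SeasonType.YEAR_BASED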
    @staticmethod
    def extract_episode_code(episode_text: str) -> str:
        """
        Extract episode code from text

        Examples:
        - "1. Folge" -> "01"
        - "1a. Teil A" -> "01a"
        - "12b. Teil B" -> "12b"
        """
        # Match patterns like "1.", "12a.", "5b."
        match = re.search(r'^(\d+[a-z]?)\.', episode_text.strip())
        if match:
            code = match.group(1)
            # Pad single digits
            if code.isdigit():
                return code.zfill(2)
            # Handle "1a" -> "01a"
            elif len(code) >= 2 and code[:-1].isdigit():
                return code[:-1].zfill(2) + code[-1]
        return "00"
    def scrape_series(self, url: str) -> Tuple[str, List[ScrapedSeason]]:
        """
        Scrape series data from fernsehserien.de

        Args:
            url: Full URL to episode guide

        Returns:
            Tuple of (series_title, list of ScrapedSeason)
        """
        logger.info(f"Scraping series from {url}")

        # Use the BeautifulSoup-based scraper
        try:
            from .fernsehserien_scraper import FernsehserienScraper
            scraper = FernsehserienScraper()
            return scraper.scrape_series(url)
        except Exception as e:
            logger.error(f"Error scraping series: {e}")
            return "Unknown Series", []
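    # Example call (assuming the sibling module fernsehserien_scraper.py is
    # available, as the deferred import above expects):
    #
    #     title, seasons = BrowserScraper().scrape_series(
    #         "https://www.fernsehserien.de/black-mirror/episodenguide")
    #     # e.g. title == "Black Mirror", seasons == [ScrapedSeason(...), ...]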
    def scrape_season_episodes(self, season_url: str) -> List[ScrapedEpisode]:
        """
        Scrape episodes for a specific season

        Args:
            season_url: URL to season page

        Returns:
            List of ScrapedEpisode objects
        """
        logger.info(f"Scraping season from {season_url}")

        # Placeholder - to be implemented with browser-tools
        episodes = []

        return episodes

class SeriesUpdater:
    """
    Handles updates for series data by clearing stored seasons/episodes
    and re-scraping them in full
    """

    def __init__(self, db_manager, scraper: BrowserScraper, progress_callback=None):
        """
        Initialize updater

        Args:
            db_manager: DatabaseManager instance
            scraper: BrowserScraper instance
            progress_callback: Optional callback for progress updates (percent, message)
        """
        self.db = db_manager
        self.scraper = scraper
        self.progress_callback = progress_callback
        self.logger = setup_logger()

    def _report_progress(self, percent: int, message: str):
        """Report progress if callback is set"""
        if self.progress_callback:
            self.progress_callback(percent, message)
        self.logger.info(message)
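    # The callback contract is simply (percent: int, message: str); a GUI
    # layer can pass its own handler, and a console fallback might look like:
    #
    #     updater = SeriesUpdater(db, scraper,
    #         progress_callback=lambda pct, msg: print(f"[{pct:3d}%] {msg}"))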
    def update_series(self, series_id: int) -> Dict[str, int]:
        """
        Update a series by clearing old data and re-scraping

        Args:
            series_id: Database ID of series to update

        Returns:
            Dictionary with counts of new/updated/unchanged items
        """
        # With the clear-and-re-add strategy everything counts as new, so
        # 'updated_episodes' and 'unchanged' currently stay at 0
        stats = {
            'new_seasons': 0,
            'new_episodes': 0,
            'updated_episodes': 0,
            'unchanged': 0
        }

        series = self.db.get_series(series_id)
        if not series:
            self.logger.error(f"Series {series_id} not found")
            return stats

        self._report_progress(0, f"Aktualisiere: {series.title}")

        # Clear existing seasons and episodes to avoid duplicates
        self._report_progress(5, "Lösche alte Daten...")
        self.db.clear_series_data(series_id)

        # Scrape fresh data
        self._report_progress(10, "Lade Episodenführer...")
        title, scraped_seasons = self.scraper.scrape_series(series.url)

        # Update series title if it changed
        if title and title != series.title:
            series.title = title
            self.db.update_series(series)

        # Add all seasons and episodes (they're all new since we cleared the data)
        from ..database.models import Season, Episode

        total_seasons = len(scraped_seasons)
        for idx, scraped_season in enumerate(scraped_seasons):
            # Calculate progress: 10-90% for scraping seasons
            progress = 10 + int((idx / total_seasons) * 80) if total_seasons > 0 else 10
            self._report_progress(progress, f"Speichere Staffel: {scraped_season.name}")

            # Add new season
            season = Season(
                id=None,
                series_id=series_id,
                name=scraped_season.name,
                season_type=scraped_season.season_type,
                sort_order=scraped_season.sort_order
            )
            season.id = self.db.add_season(season)
            stats['new_seasons'] += 1

            # Add all episodes
            for scraped_ep in scraped_season.episodes:
                episode = Episode(
                    id=None,
                    season_id=season.id,
                    episode_number=scraped_ep.episode_number,
                    episode_code=scraped_ep.episode_code,
                    title=scraped_ep.title,
                    episode_id=scraped_ep.episode_id,
                    date_de_tv=scraped_ep.date_de_tv,
                    date_de_streaming=scraped_ep.date_de_streaming,
                    date_de_home_media=scraped_ep.date_de_home_media,
                    date_de_sync=scraped_ep.date_de_sync,
                    date_original=scraped_ep.date_original
                )
                episode.comparison_date = episode.calculate_comparison_date(series.date_preference)
                self.db.add_episode(episode)
                stats['new_episodes'] += 1

        # Update last_updated timestamp
        self._report_progress(95, "Schließe Update ab...")
        series.last_updated = datetime.now()
        self.db.update_series(series)

        self._report_progress(100, "Fertig!")
        return stats
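
# Rough end-to-end usage sketch. The DatabaseManager import path and
# constructor below are assumptions from the package layout, not confirmed
# by this file:
#
#     from serien_checker.database.manager import DatabaseManager  # assumed path
#
#     db = DatabaseManager()
#     scraper = BrowserScraper()
#     updater = SeriesUpdater(db, scraper)
#     stats = updater.update_series(series_id=1)
#     print(stats)  # e.g. {'new_seasons': 3, 'new_episodes': 42,
#                   #       'updated_episodes': 0, 'unchanged': 0}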