#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "requests",
#     "beautifulsoup4",
#     "lxml",
# ]
# ///

"""
Scraper for fernsehserien.de using BeautifulSoup.

This is a standalone scraper that can be used independently.
"""

import re
import sys
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from serien_checker.database.models import SeasonType
from serien_checker.scraper.browser_scraper import ScrapedEpisode, ScrapedSeason
from serien_checker.utils.logger import setup_logger

logger = setup_logger()


class FernsehserienScraper:
    """
    Scraper for fernsehserien.de using requests + BeautifulSoup.
    """

    BASE_URL = "https://www.fernsehserien.de"
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

    @staticmethod
    def parse_german_date(date_str: str) -> Optional[datetime]:
        """
        Parse a German date string into a datetime.

        Supported formats:
        - DD.MM.YYYY
        - DD.MM.YY
        - YYYY
        """
        if not date_str or date_str.strip() == "":
            return None

        date_str = date_str.strip()

        # Try DD.MM.YYYY or DD.MM.YY
        patterns = [
            (r'(\d{1,2})\.(\d{1,2})\.(\d{4})', '%d.%m.%Y'),
            (r'(\d{1,2})\.(\d{1,2})\.(\d{2})', '%d.%m.%y'),
        ]

        for pattern, fmt in patterns:
            match = re.search(pattern, date_str)
            if match:
                try:
                    return datetime.strptime(match.group(0), fmt)
                except ValueError:
                    continue

        # Try just a year (YYYY)
        year_match = re.search(r'\b(19\d{2}|20\d{2})\b', date_str)
        if year_match:
            try:
                return datetime(int(year_match.group(1)), 1, 1)
            except ValueError:
                pass

        return None
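
    # Illustrative examples (worked by hand, not from the original file):
    #   parse_german_date("Mi. 11.12.2013") -> datetime(2013, 12, 11, 0, 0)
    #   parse_german_date("21.10.16")       -> datetime(2016, 10, 21, 0, 0)
    #   parse_german_date("2016")           -> datetime(2016, 1, 1, 0, 0)
    #   parse_german_date("unbekannt")      -> None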

    @staticmethod
    def classify_season_type(season_name: str) -> SeasonType:
        """Classify the season type based on its name"""
        name_lower = season_name.lower()

        if any(keyword in name_lower for keyword in ['special', 'specials']):
            return SeasonType.SPECIALS

        if any(keyword in name_lower for keyword in ['extra', 'extras', 'bonus']):
            return SeasonType.EXTRAS

        if any(keyword in name_lower for keyword in ['best', 'best-of', 'best of']):
            return SeasonType.BEST_OF

        if re.match(r'^(19|20)\d{2}$', season_name.strip()):
            return SeasonType.YEAR_BASED

        return SeasonType.NORMAL
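
    # Illustrative mapping (hand-checked against the rules above):
    #   "Specials"     -> SeasonType.SPECIALS
    #   "Bonusclips"   -> SeasonType.EXTRAS
    #   "Best of 2021" -> SeasonType.BEST_OF   ("best" matches before the year check)
    #   "2020"         -> SeasonType.YEAR_BASED
    #   "Staffel 3"    -> SeasonType.NORMAL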

    @staticmethod
    def extract_episode_code(episode_text: str) -> str:
        """
        Extract the episode code from text.
        Examples: "1. Folge" -> "01", "12a. Teil A" -> "12a"
        """
        match = re.search(r'^(\d+[a-z]?)\.', episode_text.strip())
        if match:
            code = match.group(1)
            if code.isdigit():
                return code.zfill(2)
            elif len(code) >= 2 and code[:-1].isdigit():
                return code[:-1].zfill(2) + code[-1]
        return "00"

    def scrape_series(self, url: str) -> Tuple[str, List[ScrapedSeason]]:
        """
        Scrape a series from fernsehserien.de.

        This scraper works in two steps:
        1. Scrape the overview page to get season links
        2. Scrape each season page to get episodes

        Args:
            url: URL of the episode guide (overview page)

        Returns:
            Tuple of (series_title, list of ScrapedSeason)
        """
        print(f"Scraping overview {url}...")

        response = self.session.get(url, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'lxml')

        # Extract the series title
        series_title = self._extract_series_title(soup)
        print(f"Series: {series_title}")

        # Find season links from the series menu
        season_links = self._extract_season_links(soup, url)
        print(f"Found {len(season_links)} seasons with dedicated pages")

        # Check if most season names contain the "bisher X Folgen" pattern.
        # If so, we need to group episodes by code prefix instead.
        bisher_pattern = re.compile(r'bisher.*\d+.*Folgen', re.IGNORECASE)
        bisher_count = sum(1 for name, _ in season_links if bisher_pattern.search(name))
        mostly_bisher = bool(season_links) and bisher_count > len(season_links) // 2

        if mostly_bisher:
            # Special case: all seasons have the same name (like "bisher 1369 Folgen").
            # Extract and group episodes from the overview page by episode code prefix.
            print(f"All seasons are named '{season_links[0][0]}' - grouping by episode code")
            seasons = self._extract_grouped_from_overview(soup)
            print(f"Found {len(seasons)} grouped seasons with {sum(len(s.episodes) for s in seasons)} episodes in total")
            return series_title, seasons

        # Scrape each season from its dedicated page.
        # First, check if any season names have duplicates (same base number).
        season_names = [name for name, _ in season_links]
        has_duplicate_seasons = False
        for name in season_names:
            # Extract the base season number (e.g., "Staffel 6" from "Staffel 6: Video-Podcast")
            base_match = re.search(r'Staffel\s+(\d+)', name)
            if base_match:
                base_num = base_match.group(1)
                # Count how many seasons have this base number
                count = sum(1 for n in season_names if f'Staffel {base_num}' in n)
                if count > 1:
                    has_duplicate_seasons = True
                    break

        # Don't fall back to the overview page if there are duplicate season numbers
        # (e.g., "Staffel 6" and "Staffel 6: Video-Podcast"), because the overview
        # page can't distinguish between variants with the same base number.
        skip_overview = has_duplicate_seasons

        seasons = []
        for i, (season_name, season_url) in enumerate(season_links):
            print(f"  Loading {season_name}...")
            season = self._scrape_season_page(season_name, season_url, i)

            # If the season has no episodes, or its episodes have no episode_number,
            # try to extract from the overview page instead.
            if season and season.episodes:
                # Check if any episode has an episode_number
                has_episode_numbers = any(ep.episode_number is not None for ep in season.episodes)

                # Also check whether the episode numbers look wrong (e.g., starting at 1
                # for each season). This happens when scraping from
                # <section itemprop="episode">, which only has season-relative numbers,
                # not overall series numbers.
                needs_overview = False
                if has_episode_numbers and not skip_overview:
                    # If the episodes are numbered 1, 2, 3... but this isn't the first
                    # season, the numbers are probably season-relative, not series-wide.
                    first_ep_num = next((ep.episode_number for ep in season.episodes if ep.episode_number), None)
                    if first_ep_num == 1 and i > 0:  # i > 0 means not the first season
                        needs_overview = True

                if (not has_episode_numbers or needs_overview) and not skip_overview:
                    # Try extracting from the overview page (only when safe)
                    overview_season = self._extract_season_from_overview(soup, season_name, i)
                    if overview_season and overview_season.episodes:
                        season = overview_season
                        print(f"  {len(season.episodes)} episodes (from overview page)")
                    else:
                        print(f"  {len(season.episodes)} episodes")
                else:
                    print(f"  {len(season.episodes)} episodes")
                seasons.append(season)
            elif season:
                # Empty season, try the overview page (only when safe)
                if not skip_overview:
                    overview_season = self._extract_season_from_overview(soup, season_name, i)
                    if overview_season and overview_season.episodes:
                        seasons.append(overview_season)
                        print(f"  {len(overview_season.episodes)} episodes (from overview page)")

        # Also check for seasons shown directly on the overview page (e.g., Specials)
        overview_seasons = self._extract_seasons_from_overview(soup, len(seasons))
        if overview_seasons:
            # Track existing season names to avoid duplicates
            existing_names = {s.name for s in seasons}
            new_seasons = [s for s in overview_seasons if s.name not in existing_names]

            if new_seasons:
                print(f"Found {len(new_seasons)} additional seasons on the overview page")
                for season in new_seasons:
                    seasons.append(season)
                    print(f"  {season.name}: {len(season.episodes)} episodes")

        return series_title, seasons

    def _extract_series_title(self, soup: BeautifulSoup) -> str:
        """Extract the series title from the page"""
        # Try meta tags first
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            title = og_title['content']
            # Remove the ": Episodenguide" suffix
            title = re.sub(r':\s*Episodenguide.*$', '', title, flags=re.IGNORECASE)
            return title.strip()

        # Fall back to h1
        h1 = soup.find('h1')
        if h1:
            return h1.get_text(strip=True)

        return "Unbekannte Serie"

    def _extract_season_links(self, soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
        """
        Extract season links from the series menu.

        Returns:
            List of (season_name, season_url) tuples
        """
        season_links = []
        seen_urls = set()

        # Collect links from multiple sources
        links = []

        # 1. Try to find the series menu navigation (newer layout)
        series_menu = soup.find('nav', class_='series-menu')
        if series_menu:
            # Find the episodenguide submenu
            episode_menu = series_menu.find('li', {'data-menu-item': 'episodenguide'})
            if episode_menu:
                # Same pattern as the global search - no trailing slash required
                links.extend(episode_menu.find_all('a', href=re.compile(r'episodenguide/(staffel-[^/]+|\d+)')))

        # 2. Search globally for season links (works for pages without a series menu).
        # Pattern matches: /episodenguide/staffel-1/, /episodenguide/staffel-1/18522, /episodenguide/0/, etc.
        # Note: no trailing slash required - URLs can end with /staffel-1 or /staffel-1/12345.
        global_links = soup.find_all('a', href=re.compile(r'episodenguide/(staffel-[^/]+|\d+)'))
        links.extend(global_links)

        for link in links:
            # Extract the season name robustly: first try text from strong/b tags only
            # (ignoring image alt text)
            strong_tag = link.find(['strong', 'b'])
            if strong_tag:
                season_name = strong_tag.get_text(strip=True)
            else:
                # Fallback: get direct text children only (exclude nested elements like img)
                season_name = ''.join(link.find_all(string=True, recursive=False)).strip()
                # If still empty, use the full text
                if not season_name:
                    season_name = link.get_text(strip=True)

            # Clean up image captions that might leak through
            season_name = re.sub(r'Bild:\s*[^A-Z]*(?=[A-Z])', '', season_name)
            season_name = re.sub(r'Foto:\s*[^A-Z]*(?=[A-Z])', '', season_name)

            # Normalize whitespace (e.g., collapse "Staffel  6" to "Staffel 6")
            season_name = ' '.join(season_name.split())

            season_url = link.get('href', '')

            # Skip navigation/anchor links
            if not season_url or season_url.startswith('#'):
                continue

            # If the season name is just "Staffel" without a number, try the URL
            if season_name.lower() in ['staffel', 'season']:
                # Try to extract the season number from a URL like "staffel-6/47453"
                url_match = re.search(r'/staffel-(\d+)', season_url)
                if url_match:
                    season_num = url_match.group(1)
                    season_name = f"Staffel {season_num}"
                    logger.debug(f"Added season number from URL: '{season_name}'")

            logger.debug(f"Extracted season name: '{season_name}' from link {season_url}")

            # Make the URL absolute
            if season_url.startswith('/'):
                season_url = self.BASE_URL + season_url
            elif not season_url.startswith('http'):
                # Relative URL like "episodenguide/0/28673" - combine with the base URL path
                season_url = urljoin(base_url, season_url)

            # Skip duplicates (extract the staffel identifier for a robust comparison).
            # This ignores different series slugs (e.g., nachtstreife-2020 vs nachtstreife-2-0).
            staffel_match = re.search(r'/(staffel-[^/]+/\d+)', season_url)
            if staffel_match:
                staffel_identifier = staffel_match.group(1).lower()
            else:
                # Fall back to full URL normalization for non-standard URLs
                staffel_identifier = season_url.lower().rstrip('/')

            logger.debug(f"Season identifier: '{staffel_identifier}' from {season_url}")

            if staffel_identifier in seen_urls:
                logger.debug(f"Skipping duplicate season URL: {season_url}")
                continue

            # Skip "Übersicht" links
            if season_name.lower() in ['übersicht', 'episoden']:
                continue

            # Clean up the season name: extract the year if it's embedded in text,
            # e.g., "Bild: NDR2026" -> "2026". Look for a 4-digit year anywhere in
            # the string (without word boundaries).
            year_match = re.search(r'(20\d{2}|19\d{2})', season_name)
            if year_match:
                year = year_match.group(1)
                # Check if the name starts with an image caption (e.g., "Bild: NDR2026")
                if re.match(r'^(Bild:|Foto:)', season_name, re.IGNORECASE):
                    season_name = year

            # Handle duplicate season names by appending "Teil 2", "Teil 3", etc.
            # This happens when multiple seasons share a name but have different URLs
            # (e.g., "2020" regular episodes vs. "2020" specials).
            existing_names = [name for name, _ in season_links]
            if season_name in existing_names:
                # Count the base name and all existing "Teil X" variants
                base_name = season_name
                count = 1
                for name in existing_names:
                    if name == base_name or name.startswith(f"{base_name} Teil "):
                        count += 1
                original_name = season_name
                season_name = f"{season_name} Teil {count}"
                logger.debug(f"Duplicate season name detected: '{original_name}' -> '{season_name}' (URL: {season_url})")

            seen_urls.add(staffel_identifier)
            season_links.append((season_name, season_url))
            logger.debug(f"Added season: '{season_name}' -> {season_url}")

        return season_links
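
    # Illustrative return value (hypothetical series slug and page IDs):
    #   [("Staffel 1", "https://www.fernsehserien.de/<serie>/episodenguide/staffel-1/12345"),
    #    ("Specials",  "https://www.fernsehserien.de/<serie>/episodenguide/0/67890")]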

    def _extract_seasons_from_overview(self, soup: BeautifulSoup, start_sort_order: int) -> List[ScrapedSeason]:
        """
        Extract seasons that are shown directly on the overview page (e.g., Specials).

        Args:
            soup: BeautifulSoup of the overview page
            start_sort_order: Starting sort order number

        Returns:
            List of ScrapedSeason objects
        """
        seasons = []
        sort_order = start_sort_order

        # Find sections with season headers (but no corresponding menu link).
        # These are typically Specials or other special categories.
        sections = soup.find_all('section')

        for section in sections:
            # Look for headers like "Specials", "Extras", etc.
            header = section.find(['h2', 'h3'], id=re.compile(r'Special|Extra'))

            if not header:
                continue

            season_name = header.get_text(strip=True)

            # Skip if this is just a navigation element
            if 'karussell' in season_name.lower():
                continue

            # Extract episodes from this section
            episodes = self._extract_episodes_from_page(section)

            if episodes:
                season_type = self.classify_season_type(season_name)

                season = ScrapedSeason(
                    name=season_name,
                    season_type=season_type,
                    sort_order=sort_order,
                    episodes=episodes
                )
                seasons.append(season)
                sort_order += 1

        return seasons

    def _extract_grouped_from_overview(self, soup: BeautifulSoup) -> List[ScrapedSeason]:
        """
        Extract all episodes from the overview page and group them by episode code prefix.
        Used for series like "Wer weiß denn sowas?" where all seasons have the same name.

        Args:
            soup: BeautifulSoup of the overview page

        Returns:
            List of ScrapedSeason objects grouped by episode code prefix
        """
        # First, try to find season links to get proper season numbers.
        # Pattern: /episodenguide/11/30583 -> season 11
        season_links = soup.find_all('a', href=re.compile(r'episodenguide/(\d+)/'))
        season_numbers = set()
        for link in season_links:
            href = link.get('href', '')
            match = re.search(r'episodenguide/(\d+)/', href)
            if match:
                season_numbers.add(match.group(1))

        # Find all episode rows
        all_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'})

        # Group episodes by their code prefix (e.g., "1.01" -> "1", "2.001" -> "2")
        groups = {}

        for i, row in enumerate(all_rows, 1):
            episode = self._parse_episode_row(row, i)
            if not episode:
                continue

            # Extract the group from the row's title attribute (e.g., "1.01 Title" -> "1").
            # This is more reliable than episode_code, which might be just "01".
            row_title = row.get('title', '')
            title_code_match = re.match(r'^(\d+)\.', row_title)

            if title_code_match:
                group_key = title_code_match.group(1)
            else:
                # Fallback: try the episode code
                code_match = re.match(r'^(\d+)[x.]', episode.episode_code)
                if not code_match:
                    code_match = re.match(r'^(\d+)', episode.episode_code)

                if code_match:
                    group_key = code_match.group(1)
                else:
                    group_key = episode.episode_code

            # Check the title for special season indicators (XXL, Quizmarathon, etc.)
            title_lower = episode.title.lower()
            if 'xxl' in title_lower:
                group_key = 'XXL'
            elif 'quizmarathon' in title_lower:
                group_key = 'Quizmarathon'

            groups.setdefault(group_key, []).append(episode)

        # Convert the groups to ScrapedSeason objects
        seasons = []

        # Sort by numeric key where possible
        def sort_key(item):
            group_key, _ = item
            if group_key.isdigit():
                return (0, int(group_key))  # Numeric groups first
            return (1, group_key)  # Non-numeric groups (XXL, etc.) after

        for sort_order, (group_key, episode_list) in enumerate(sorted(groups.items(), key=sort_key)):
            # Fix duplicate episode codes within this group
            episode_list = self._fix_duplicate_episode_codes(episode_list)

            # Determine the season name and type
            if group_key == 'XXL':
                season_name = "Wer weiß denn sowas XXL"
                season_type = SeasonType.EXTRAS
            elif group_key == 'Quizmarathon':
                season_name = "Quizmarathon"
                season_type = SeasonType.SPECIALS
            elif group_key == '0':
                season_name = "Specials"
                season_type = SeasonType.SPECIALS
            elif group_key.isdigit() and group_key in season_numbers:
                # This is a proper season number from the URLs
                season_name = f"Staffel {int(group_key)}"
                season_type = SeasonType.NORMAL
            else:
                # Fallback: use the year or the group number
                first_ep_date = next((ep.date_de_tv for ep in episode_list if ep.date_de_tv), None)
                if first_ep_date:
                    season_name = str(first_ep_date.year)
                    season_type = SeasonType.YEAR_BASED
                else:
                    season_name = f"Gruppe {group_key}"
                    season_type = SeasonType.NORMAL

            season = ScrapedSeason(
                name=season_name,
                season_type=season_type,
                sort_order=sort_order,
                episodes=episode_list
            )
            seasons.append(season)

        return seasons

    def _extract_season_from_overview(self, soup: BeautifulSoup, season_name: str, sort_order: int) -> Optional[ScrapedSeason]:
        """
        Extract episodes for a specific season from the overview page.
        Used when individual season pages don't have episode_number data.

        Args:
            soup: BeautifulSoup of the overview page
            season_name: Name of the season to extract (e.g., "Staffel 1")
            sort_order: Sort order number

        Returns:
            ScrapedSeason or None
        """
        # Find all episode rows on the overview page
        all_episode_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'})

        if not all_episode_rows:
            return None

        # Extract episodes and filter them by season
        episodes = []

        for i, row in enumerate(all_episode_rows, 1):
            # Parse the episode
            episode = self._parse_episode_row(row, i)

            if not episode:
                continue

            # Check whether the episode belongs to this season by looking at the URL
            href = row.get('href', '')

            # Skip XXL or special versions when looking for a plain season
            # (these are handled by their own dedicated pages)
            if 'xxl' in href.lower() and ':' not in season_name.lower():
                continue

            # Extract the season number from an href like "/comedystreet/folgen/1x01-..."
            season_match = re.search(r'/folgen/(\d+)x\d+', href)

            if not season_match:
                continue

            # Convert a season_name like "Staffel 1" to a number.
            # Don't match "Staffel 1: Something" - only plain "Staffel 1".
            season_num_match = re.search(r'^Staffel\s+(\d+)$', season_name.strip())
            if not season_num_match:
                # Try without the "Staffel" prefix - might be just "1", "2", etc.
                season_num_match = re.search(r'^(\d+)$', season_name.strip())

            if not season_num_match:
                # This is a special season like "Staffel 1: XXL"; skip it.
                # Those are handled by their own dedicated pages.
                continue

            expected_season_num = int(season_num_match.group(1))
            actual_season_num = int(season_match.group(1))

            if expected_season_num != actual_season_num:
                continue

            # This episode belongs to the requested season
            episodes.append(episode)

        if not episodes:
            return None

        # Fix duplicate episode codes
        episodes = self._fix_duplicate_episode_codes(episodes)

        season_type = self.classify_season_type(season_name)

        logger.debug(f"Scraped season '{season_name}': {len(episodes)} episodes found")

        return ScrapedSeason(
            name=season_name,
            season_type=season_type,
            sort_order=sort_order,
            episodes=episodes
        )

    def _scrape_season_page(self, season_name: str, season_url: str, sort_order: int) -> Optional[ScrapedSeason]:
        """
        Scrape a single season page.

        Args:
            season_name: Name of the season
            season_url: URL of the season page
            sort_order: Sort order number

        Returns:
            ScrapedSeason or None
        """
        try:
            response = self.session.get(season_url, timeout=15)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"  Error loading page: {e}")
            return None

        soup = BeautifulSoup(response.content, 'lxml')

        # Extract episodes from this page
        episodes = self._extract_episodes_from_page(soup)

        if not episodes:
            return None

        season_type = self.classify_season_type(season_name)

        logger.debug(f"Scraped season '{season_name}': {len(episodes)} episodes found")

        return ScrapedSeason(
            name=season_name,
            season_type=season_type,
            sort_order=sort_order,
            episodes=episodes
        )

    def _fix_duplicate_episode_codes(self, episodes: List[ScrapedEpisode]) -> List[ScrapedEpisode]:
        """
        Fix duplicate episode codes by adding letter suffixes (a, b, c, ...) to ALL
        episodes that share the same code (including the first occurrence).

        For example, if three episodes have code "01", they become "01a", "01b", "01c".

        Args:
            episodes: List of episodes that may contain duplicates

        Returns:
            List of episodes with unique codes
        """
        # Count how many times each code appears
        code_counts = Counter(ep.episode_code for ep in episodes)

        # Track which suffix to use next for each code
        code_suffixes = {}

        # Process each episode
        for episode in episodes:
            original_code = episode.episode_code

            # If this code appears more than once, add a suffix to ALL occurrences
            if code_counts[original_code] > 1:
                # Get the next suffix for this code (a, b, c, ...)
                if original_code not in code_suffixes:
                    code_suffixes[original_code] = ord('a')

                suffix_char = chr(code_suffixes[original_code])
                episode.episode_code = f"{original_code}{suffix_char}"
                code_suffixes[original_code] += 1

        return episodes
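
    # Illustrative example (worked by hand): input codes ["01", "01", "02", "01"]
    # become ["01a", "01b", "02", "01c"] - every duplicate occurrence gets a suffix,
    # while the unique code "02" is left untouched.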

    def _extract_episodes_from_page(self, soup: BeautifulSoup) -> List[ScrapedEpisode]:
        """
        Extract episodes from a season page.

        fernsehserien.de uses H3 headers for episode titles within section elements,
        OR section elements with itemprop="episode" (alternative layout).
        """
        episodes = []

        # Method 1: find all H3 elements that start with a number or "Folge X".
        # Accept both the "41. Title" and the "Folge 42" format.
        all_h3 = soup.find_all('h3')
        episode_h3s = [h3 for h3 in all_h3 if re.match(r'^(\d+[a-z]?\.|Folge\s+\d+)', h3.get_text(strip=True))]

        if episode_h3s:
            # Method 1: H3-based extraction (main overview page with detailed episode info)
            for h3 in episode_h3s:
                episode = self._parse_episode_h3(h3)
                if episode:
                    episodes.append(episode)
        else:
            # Method 2: try the overview page table format with
            # <a role="row" itemprop="episode">. This format has the episode_number
            # in the cells.
            episode_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'})
            if episode_rows:
                for i, row in enumerate(episode_rows, 1):
                    episode = self._parse_episode_row(row, i)
                    if episode:
                        episodes.append(episode)
            else:
                # Method 3: try the alternative layout with section[itemprop="episode"],
                # used on some dedicated season pages (e.g., ComedyStreet Staffel 6+)
                episode_sections = soup.find_all('section', itemprop='episode')
                for i, section in enumerate(episode_sections, 1):
                    episode = self._parse_episode_section(section, i)
                    if episode:
                        episodes.append(episode)

        # After collecting all episodes, handle duplicate episode codes.
        # This ensures ALL episodes with the same code get suffixes (a, b, c, ...).
        episodes = self._fix_duplicate_episode_codes(episodes)

        return episodes

    def _parse_episode_h3(self, h3) -> Optional[ScrapedEpisode]:
        """
        Parse an episode from an H3 header.

        Format: "1. Episode Title (Original Title)"
        The parent section contains date information as plain text.
        """
        title_text = h3.get_text(strip=True)

        # Extract episode_number (overall series number) from the H3 text.
        # Example: "111. Episode Title" -> episode_number = 111
        # Example: "Folge 42" -> episode_number = 42
        episode_number = None
        number_match = re.match(r'^(\d+)[a-z]?\.', title_text)
        if not number_match:
            # Try the "Folge X" format
            number_match = re.match(r'^Folge\s+(\d+)', title_text)
        if number_match:
            try:
                episode_number = int(number_match.group(1))
            except ValueError:
                pass

        # Extract the title (everything after the number or "Folge X")
        title_match = re.match(r'^\d+[a-z]?\.?\s*(.+)', title_text)
        if not title_match:
            # Try the "Folge X Title" format
            title_match = re.match(r'^Folge\s+\d+\s*(.+)?', title_text)
        if title_match:
            title = title_match.group(1).strip() if title_match.group(1) else title_text
        else:
            title = title_text

        # Remove an English title in parentheses if present.
        # Example: "Gewöhnliche Leute(Nosedive)" -> "Gewöhnliche Leute"
        # Example: "White Christmas" -> "White Christmas" (unchanged without a German title)
        title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip()

        # Get the parent section that contains the date information
        section = h3.find_parent('section')
        if not section:
            # Fallback: return the episode without dates
            return ScrapedEpisode(
                episode_code="00",
                title=title,
                episode_number=episode_number
            )

        # Extract episode_code (season-specific episode number) and episode_id
        # from the episode link.
        # Format: /folgen/12x01-title-1828679 or /folgen/01-title-1828679 (for specials)
        episode_code = None
        episode_id = None
        episode_link = section.find('a', href=re.compile(r'/folgen/'))
        if episode_link:
            href = episode_link.get('href', '')
            # Extract episode_id (last number in the URL)
            episode_id_match = re.search(r'-(\d+)$', href)
            if episode_id_match:
                episode_id = episode_id_match.group(1)

            # Try the format /folgen/12x01-... (regular episodes)
            season_episode_match = re.search(r'/folgen/(\d+)x(\d+)', href)
            if season_episode_match:
                episode_code = season_episode_match.group(2).zfill(2)
            else:
                # Try the format /folgen/01-... (specials without a season prefix)
                special_match = re.search(r'/folgen/(\d+)-', href)
                if special_match:
                    episode_code = special_match.group(1).zfill(2)

        # Fallback: extract from the H3 text if link extraction failed.
        # Examples: "0.01 Title" -> "01", "1. Title" -> "01", "12a. Title" -> "12a"
        if not episode_code:
            # Try to find a pattern like "X.YY" in the title text (e.g., "0.01")
            decimal_match = re.match(r'^\d+\.(\d+[a-z]?)', title_text)
            if decimal_match:
                ep_num = decimal_match.group(1)
                if ep_num.isdigit():
                    episode_code = ep_num.zfill(2)
                else:
                    # Handle cases like "12a"
                    episode_code = ep_num[:-1].zfill(2) + ep_num[-1] if len(ep_num) >= 2 else ep_num.zfill(2)
            else:
                # Last resort: use the episode number from the start of the H3
                episode_code = self.extract_episode_code(title_text)

        # Extract dates from the section text
        section_text = section.get_text()
        dates = self._extract_dates_from_text(section_text)

        return ScrapedEpisode(
            episode_code=episode_code,
            title=title,
            episode_number=episode_number,
            episode_id=episode_id,
            date_de_tv=dates.get('de_tv'),
            date_de_streaming=dates.get('de_streaming'),
            date_de_home_media=dates.get('de_home_media'),
            date_de_sync=dates.get('de_sync'),
            date_original=dates.get('original')
        )

    def _parse_episode_section(self, section, fallback_number: int) -> Optional[ScrapedEpisode]:
        """
        Parse an episode from a section element with itemprop="episode".
        Used on dedicated season pages with the alternative layout (e.g., ComedyStreet Staffel 1-5).

        Args:
            section: BeautifulSoup section element
            fallback_number: Episode number to use if it can't be extracted from the URL
        """
        # Extract the title from itemprop="name"
        title_elem = section.find(itemprop='name')
        if not title_elem:
            return None

        title = title_elem.get_text(strip=True)

        # Remove an English title in parentheses if present
        title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip()

        # Try to extract episode info from the URL: href="/series/folgen/1x01-title-1828679"
        url_elem = section.find('a', itemprop='url')
        episode_code = None
        episode_number = None
        episode_id = None

        if url_elem:
            href = url_elem.get('href', '')
            # Extract episode_id (last number in the URL)
            episode_id_match = re.search(r'-(\d+)$', href)
            if episode_id_match:
                episode_id = episode_id_match.group(1)

            # Pattern: /folgen/1x01-... or /folgen/SxE-...
            match = re.search(r'/folgen/(\d+)x(\d+)', href)
            if match:
                episode_code = match.group(2).zfill(2)

        # Try to extract episode_number from <div role="cell" itemprop="episodeNumber">.
        # Structure on the overview page: <div content="1" itemprop="episodeNumber" role="cell">01</div>
        # The content attribute contains the overall episode number.
        cell_div = section.find('div', itemprop='episodeNumber')
        if cell_div:
            # Extract from the content attribute
            content_attr = cell_div.get('content')
            if content_attr:
                try:
                    episode_number = int(content_attr)
                except (ValueError, TypeError):
                    pass

        # Fallback: use sequential numbering
        if not episode_code:
            episode_code = str(fallback_number).zfill(2)

        # Extract dates
        section_text = section.get_text()
        dates = self._extract_dates_from_text(section_text)

        return ScrapedEpisode(
            episode_code=episode_code,
            title=title,
            episode_number=episode_number,
            episode_id=episode_id,
            date_de_tv=dates.get('de_tv'),
            date_de_streaming=dates.get('de_streaming'),
            date_de_home_media=dates.get('de_home_media'),
            date_de_sync=dates.get('de_sync'),
            date_original=dates.get('original')
        )

    def _parse_episode_row(self, row, fallback_number: int) -> Optional[ScrapedEpisode]:
        """
        Parse an episode from a row element (<a role="row" itemprop="episode">).
        Used on overview pages with the table format.

        Args:
            row: BeautifulSoup <a> element with role="row"
            fallback_number: Episode number to use if it can't be extracted
        """
        # Extract episode_id from the row href (e.g., /series/folgen/1x01-title-1828679)
        episode_id = None
        href = row.get('href', '')
        if href:
            episode_id_match = re.search(r'-(\d+)$', href)
            if episode_id_match:
                episode_id = episode_id_match.group(1)

        # Get all cells in the row
        cells = row.find_all('div', role='cell')

        if len(cells) < 7:
            return None

        # Cell structure (observed in test output):
        #   Cell 2: overall episode number (before the span) + season.episode (in the span)
        #   Cell 5: episode_code with itemprop="episodeNumber"
        #   Cell 7: title with itemprop="name"
        #   Cell 8: date

        # Extract the overall episode_number from cell 2
        episode_number = None
        if len(cells) >= 2:
            cell2 = cells[1]  # Index 1 = cell 2
            # Structure: <div role="cell">1<span class="episodenliste-schmal"><b>1.01</b></span></div>
            # Extract the number before the span
            cell2_text = ''
            for child in cell2.children:
                if isinstance(child, str):
                    cell2_text += child.strip()
                else:
                    # Stop at the first tag (the span)
                    break

            if cell2_text:
                try:
                    episode_number = int(cell2_text)
                except ValueError:
                    pass

        # Extract episode_code from cell 5 (itemprop="episodeNumber")
        episode_code = None
        ep_code_cell = row.find('div', itemprop='episodeNumber')
        if ep_code_cell:
            code_text = ep_code_cell.get_text(strip=True)
            if code_text:
                episode_code = code_text.zfill(2)

        # Fallback for episode_code
        if not episode_code:
            episode_code = str(fallback_number).zfill(2)

        # Extract the title from cell 7
        title = ""
        title_cell = row.find('div', class_='episodenliste-2019-episodentitel')
        if title_cell:
            title_elem = title_cell.find(itemprop='name')
            if title_elem:
                title = title_elem.get_text(strip=True)
                # Remove an English title in parentheses if present
                title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip()

        # Extract the date from cell 8 (plain date text)
        date_de_tv = None
        if len(cells) >= 8:
            date_cell = cells[7]  # Index 7 = cell 8
            date_text = date_cell.get_text(strip=True)
            if date_text:
                date_de_tv = self.parse_german_date(date_text)

        return ScrapedEpisode(
            episode_code=episode_code,
            title=title,
            episode_number=episode_number,
            episode_id=episode_id,
            date_de_tv=date_de_tv,
            date_de_streaming=None,
            date_de_home_media=None,
            date_de_sync=None,
            date_original=None
        )

    def _extract_dates_from_text(self, text: str) -> Dict[str, Optional[datetime]]:
        """
        Extract dates from plain text containing German date labels.

        Expected formats:
        - "Deutsche TV-Premiere Mi. 11.12.2013 RTL Crime"
        - "Deutsche Streaming-Premiere Fr. 21.10.2016 Netflix"
        - "Deutsche Home-Media-Premiere Do. 21.11.2024"
        - "Original-TV-Premiere So. 04.12.2011 Channel 4"
        - "Premiere der deutschen Synchronfassung ..."
        """
        dates = {
            'de_tv': None,
            'de_streaming': None,
            'de_home_media': None,
            'de_sync': None,
            'original': None
        }

        # Search for "Deutsche TV-Premiere" followed by a date
        match = re.search(r'Deutsche\s+TV-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
        if match:
            dates['de_tv'] = self.parse_german_date(match.group(1))

        # Search for "Deutsche Streaming-Premiere"
        match = re.search(r'Deutsche\s+Streaming-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
        if match:
            dates['de_streaming'] = self.parse_german_date(match.group(1))

        # Search for "Deutsche Home-Media-Premiere"
        match = re.search(r'Deutsche\s+Home-Media-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
        if match:
            dates['de_home_media'] = self.parse_german_date(match.group(1))

        # Search for "Premiere der deutschen Synchronfassung"
        match = re.search(r'Premiere\s+der\s+deutschen\s+Synchronfassung\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
        if match:
            dates['de_sync'] = self.parse_german_date(match.group(1))

        # Search for "Original-TV-Premiere" or "Original-Streaming-Premiere"
        match = re.search(r'Original-(?:TV|Streaming)-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
        if match:
            dates['original'] = self.parse_german_date(match.group(1))

        return dates
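
    # Illustrative example (using the first format listed in the docstring):
    #   _extract_dates_from_text("Deutsche TV-Premiere Mi. 11.12.2013 RTL Crime")
    #   -> {'de_tv': datetime(2013, 12, 11), 'de_streaming': None,
    #       'de_home_media': None, 'de_sync': None, 'original': None}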


def main():
    """Test the scraper"""
    if len(sys.argv) < 2:
        print("Usage: python fernsehserien_scraper.py <url>")
        print("Example: python fernsehserien_scraper.py https://www.fernsehserien.de/black-mirror/episodenguide")
        sys.exit(1)

    url = sys.argv[1]

    scraper = FernsehserienScraper()
    title, seasons = scraper.scrape_series(url)

    print(f"\n=== {title} ===")
    print(f"Total seasons: {len(seasons)}\n")

    for season in seasons:
        print(f"{season.name} ({season.season_type.value}): {len(season.episodes)} episodes")
        for ep in season.episodes[:3]:  # Show the first 3
            dates = []
            if ep.date_original:
                dates.append(f"Orig: {ep.date_original.strftime('%d.%m.%Y')}")
            if ep.date_de_tv:
                dates.append(f"DE: {ep.date_de_tv.strftime('%d.%m.%Y')}")

            date_str = ", ".join(dates) if dates else "No dates"
            print(f"  {ep.episode_code}. {ep.title} ({date_str})")

        if len(season.episodes) > 3:
            print(f"  ... and {len(season.episodes) - 3} more")
        print()


if __name__ == "__main__":
    main()