#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "requests",
# "beautifulsoup4",
# "lxml",
# ]
# ///
"""
Scraper for fernsehserien.de using BeautifulSoup
This is a standalone scraper that can be used independently
"""
import re
import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Optional, Tuple
from datetime import datetime
import sys
from pathlib import Path
# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from serien_checker.database.models import SeasonType
from serien_checker.scraper.browser_scraper import ScrapedEpisode, ScrapedSeason
from serien_checker.utils.logger import setup_logger
logger = setup_logger()
class FernsehserienScraper:
"""
Scraper for fernsehserien.de using requests + BeautifulSoup
"""
BASE_URL = "https://www.fernsehserien.de"
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
def __init__(self):
self.session = requests.Session()
self.session.headers.update(self.HEADERS)
@staticmethod
def parse_german_date(date_str: str) -> Optional[datetime]:
"""
Parse German date format to datetime
Supports formats:
- DD.MM.YYYY
- DD.MM.YY
- YYYY
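Examples (doctest-style, illustrative):
>>> FernsehserienScraper.parse_german_date("11.12.2013")
datetime.datetime(2013, 12, 11, 0, 0)
>>> FernsehserienScraper.parse_german_date("2016")
datetime.datetime(2016, 1, 1, 0, 0)
>>> FernsehserienScraper.parse_german_date("") is None
True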
"""
if not date_str or date_str.strip() == "":
return None
date_str = date_str.strip()
# Try DD.MM.YYYY or DD.MM.YY
patterns = [
(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', '%d.%m.%Y'),
(r'(\d{1,2})\.(\d{1,2})\.(\d{2})', '%d.%m.%y'),
]
for pattern, fmt in patterns:
match = re.search(pattern, date_str)
if match:
try:
return datetime.strptime(match.group(0), fmt)
except ValueError:
continue
# Try just year (YYYY)
year_match = re.search(r'\b(19\d{2}|20\d{2})\b', date_str)
if year_match:
try:
return datetime(int(year_match.group(1)), 1, 1)
except ValueError:
pass
return None
@staticmethod
def classify_season_type(season_name: str) -> SeasonType:
"""Classify season type based on name"""
name_lower = season_name.lower()
if any(keyword in name_lower for keyword in ['special', 'specials']):
return SeasonType.SPECIALS
if any(keyword in name_lower for keyword in ['extra', 'extras', 'bonus']):
return SeasonType.EXTRAS
if any(keyword in name_lower for keyword in ['best', 'best-of', 'best of']):
return SeasonType.BEST_OF
if re.match(r'^(19|20)\d{2}$', season_name.strip()):
return SeasonType.YEAR_BASED
return SeasonType.NORMAL
@staticmethod
def extract_episode_code(episode_text: str) -> str:
"""
Extract episode code from text
Examples: "1. Folge" -> "01", "12a. Teil A" -> "12a"
"""
match = re.search(r'^(\d+[a-z]?)\.', episode_text.strip())
if match:
code = match.group(1)
if code.isdigit():
return code.zfill(2)
elif len(code) >= 2 and code[:-1].isdigit():
return code[:-1].zfill(2) + code[-1]
return "00"
def scrape_series(self, url: str) -> Tuple[str, List[ScrapedSeason]]:
"""
Scrape series from fernsehserien.de
This scraper works in two steps:
1. Scrape overview page to get season links
2. Scrape each season page to get episodes
Args:
url: URL to episode guide (overview page)
Returns:
Tuple of (series_title, list of ScrapedSeason)
"""
print(f"Scraping overview {url}...")
response = self.session.get(url, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')
# Extract series title
series_title = self._extract_series_title(soup)
print(f"Serie: {series_title}")
# Find season links from the series menu
season_links = self._extract_season_links(soup, url)
print(f"Gefunden: {len(season_links)} Staffeln mit eigenen Seiten")
# Check if most season names contain "bisher X Folgen" pattern
# If so, we need to group episodes by code prefix instead
bisher_pattern = re.compile(r'bisher.*\d+.*Folgen', re.IGNORECASE)
has_bisher = [bool(bisher_pattern.search(name)) for name, _ in season_links]
bisher_count = sum(has_bisher)
mostly_bisher = bool(season_links) and bisher_count > len(season_links) // 2
if mostly_bisher:
# Special case: All seasons have same name (like "bisher 1369 Folgen")
# Extract and group episodes from overview page by episode code prefix
print(f"Alle Staffeln heißen '{season_links[0][0]}' - gruppiere nach Episode-Code")
seasons = self._extract_grouped_from_overview(soup)
print(f"Gefunden: {len(seasons)} gruppierte Staffeln mit insgesamt {sum(len(s.episodes) for s in seasons)} Episoden")
return series_title, seasons
# Scrape each season from dedicated pages
# First, check if any season names have duplicates (same base number)
season_names = [name for name, _ in season_links]
has_duplicate_seasons = False
for name in season_names:
# Extract base season number (e.g., "Staffel 6" from "Staffel 6: Video-Podcast")
base_match = re.search(r'Staffel\s+(\d+)', name)
if base_match:
base_num = base_match.group(1)
# Count how many seasons share this base number (word boundary so "Staffel 1" does not match "Staffel 10")
count = sum(1 for n in season_names if re.search(rf'Staffel\s+{base_num}\b', n))
if count > 1:
has_duplicate_seasons = True
break
seasons = []
for i, (season_name, season_url) in enumerate(season_links):
print(f" Lade {season_name}...")
season = self._scrape_season_page(season_name, season_url, i)
# If season has no episodes or episodes have no episode_number,
# try to extract from overview page instead
if season and season.episodes:
# Check if any episode has episode_number
has_episode_numbers = any(ep.episode_number is not None for ep in season.episodes)
# Don't use overview page if there are duplicate season numbers
# (e.g., "Staffel 6" and "Staffel 6: Video-Podcast")
# because the overview page can't distinguish between variants with same base number
skip_overview = has_duplicate_seasons
# Also check if episode_numbers look wrong (e.g., starting at 1 for each season)
# This happens when scraping from <section itemprop="episode"> which only has
# season-relative numbers, not overall series numbers
needs_overview = False
if has_episode_numbers and not skip_overview:
# If all episodes have numbers 1, 2, 3... but this isn't Staffel 1,
# the numbers are probably wrong (season-relative instead of series-wide)
first_ep_num = next((ep.episode_number for ep in season.episodes if ep.episode_number), None)
if first_ep_num == 1 and i > 0: # i > 0 means not the first season
needs_overview = True
if (not has_episode_numbers or needs_overview) and not skip_overview:
# Try extracting from overview page (only when safe)
overview_season = self._extract_season_from_overview(soup, season_name, i)
if overview_season and overview_season.episodes:
season = overview_season
print(f" {len(season.episodes)} Episoden (von Übersichtsseite)")
else:
print(f" {len(season.episodes)} Episoden")
else:
print(f" {len(season.episodes)} Episoden")
seasons.append(season)
elif season:
# Empty season, try overview page (only when safe, i.e. no duplicate season numbers)
if not has_duplicate_seasons:
overview_season = self._extract_season_from_overview(soup, season_name, i)
if overview_season and overview_season.episodes:
seasons.append(overview_season)
print(f" {len(overview_season.episodes)} Episoden (von Übersichtsseite)")
# Also check for seasons directly on overview page (e.g., Specials)
overview_seasons = self._extract_seasons_from_overview(soup, len(seasons))
if overview_seasons:
# Track existing season names to avoid duplicates
existing_names = {s.name for s in seasons}
new_seasons = [s for s in overview_seasons if s.name not in existing_names]
if new_seasons:
print(f"Gefunden: {len(new_seasons)} zusätzliche Staffeln auf Übersichtsseite")
for season in new_seasons:
seasons.append(season)
print(f" {season.name}: {len(season.episodes)} Episoden")
return series_title, seasons
def _extract_series_title(self, soup: BeautifulSoup) -> str:
"""Extract series title from page"""
# Try meta tags first
og_title = soup.find('meta', property='og:title')
if og_title and og_title.get('content'):
title = og_title['content']
# Remove ": Episodenguide" suffix
title = re.sub(r':\s*Episodenguide.*$', '', title, flags=re.IGNORECASE)
return title.strip()
# Fallback to h1
h1 = soup.find('h1')
if h1:
return h1.get_text(strip=True)
return "Unbekannte Serie"
def _extract_season_links(self, soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
"""
Extract season links from the series menu
Returns:
List of (season_name, season_url) tuples
"""
season_links = []
seen_urls = set()
# Collect links from multiple sources
links = []
# 1. Try to find the series menu navigation (newer layout)
series_menu = soup.find('nav', class_='series-menu')
if series_menu:
# Find the episodenguide submenu
episode_menu = series_menu.find('li', {'data-menu-item': 'episodenguide'})
if episode_menu:
# Same pattern as global search - no trailing slash required
links.extend(episode_menu.find_all('a', href=re.compile(r'episodenguide/(staffel-[^/]+|\d+)')))
# 2. Search globally for season links (works for pages without series-menu)
# Pattern matches: /episodenguide/staffel-1/, /episodenguide/staffel-1/18522, /episodenguide/0/, etc.
# Note: No trailing slash required - URLs can end with /staffel-1 or /staffel-1/12345
global_links = soup.find_all('a', href=re.compile(r'episodenguide/(staffel-[^/]+|\d+)'))
links.extend(global_links)
for link in links:
# Extract season name more robustly
# First, try to get text from strong/b tags only (ignoring image alt text)
strong_tag = link.find(['strong', 'b'])
if strong_tag:
season_name = strong_tag.get_text(strip=True)
else:
# Fallback: get direct text children only (exclude nested elements like img)
season_name = ''.join(link.find_all(string=True, recursive=False)).strip()
# If still empty, use full text
if not season_name:
season_name = link.get_text(strip=True)
# Clean up image captions that might leak through
season_name = re.sub(r'Bild:\s*[^A-Z]*(?=[A-Z])', '', season_name)
season_name = re.sub(r'Foto:\s*[^A-Z]*(?=[A-Z])', '', season_name)
# Normalize whitespace (convert "Staffel6" or "Staffel 6" to "Staffel 6")
season_name = ' '.join(season_name.split())
season_url = link.get('href', '')
# If season name is just "Staffel" without number, try to extract from URL
if season_name.lower() in ['staffel', 'season']:
# Try to extract season number from URL like "staffel-6/47453"
url_match = re.search(r'/staffel-(\d+)', season_url)
if url_match:
season_num = url_match.group(1)
season_name = f"Staffel {season_num}"
logger.debug(f"Added season number from URL: '{season_name}'")
logger.debug(f"Extracted season name: '{season_name}' from link {season_url}")
# Skip navigation/anchor links
if not season_url or season_url.startswith('#'):
continue
if season_url:
# Make absolute URL
if season_url.startswith('/'):
season_url = self.BASE_URL + season_url
elif not season_url.startswith('http'):
# Relative URL like "episodenguide/0/28673"
# Need to combine with base URL path
from urllib.parse import urljoin
season_url = urljoin(base_url, season_url)
# Skip duplicates (extract staffel identifier for robust comparison)
# This ignores different series slugs (e.g., nachtstreife-2020 vs nachtstreife-2-0)
staffel_match = re.search(r'/(staffel-[^/]+/\d+)', season_url)
if staffel_match:
staffel_identifier = staffel_match.group(1).lower()
else:
# Fallback to full URL normalization for non-standard URLs
staffel_identifier = season_url.lower().rstrip('/')
logger.debug(f"Season identifier: '{staffel_identifier}' from {season_url}")
if staffel_identifier in seen_urls:
logger.debug(f"Skipping duplicate season URL: {season_url}")
continue
# Skip "Übersicht" link
if season_name.lower() in ['übersicht', 'episoden']:
continue
# Clean up season name: extract year if it's embedded in text
# e.g., "Bild: NDR2026" -> "2026"
# Look for 4-digit year anywhere in the string (without word boundaries)
year_match = re.search(r'(20\d{2}|19\d{2})', season_name)
if year_match:
year = year_match.group(1)
# Check if name starts with image caption (e.g., "Bild: NDR2026")
if re.match(r'^(Bild:|Foto:)', season_name, re.IGNORECASE):
season_name = year
# Handle duplicate season names by adding Teil 2, Teil 3, etc.
# This happens when there are multiple seasons with the same name but different URLs
# (e.g., "2020" regular episodes vs "2020" specials)
existing_names = [name for name, _ in season_links]
if season_name in existing_names:
# Count how many times this base name already exists (including "Teil X" variants)
base_name = season_name
count = 1
# Count both the base name and all "Teil X" variants
for name in existing_names:
if name == base_name or name.startswith(f"{base_name} Teil "):
count += 1
original_name = season_name
season_name = f"{season_name} Teil {count}"
logger.debug(f"Duplicate season name detected: '{original_name}' -> '{season_name}' (URL: {season_url})")
seen_urls.add(staffel_identifier)
season_links.append((season_name, season_url))
logger.debug(f"Added season: '{season_name}' -> {season_url}")
return season_links
def _extract_seasons_from_overview(self, soup: BeautifulSoup, start_sort_order: int) -> List[ScrapedSeason]:
"""
Extract seasons that are shown directly on the overview page (e.g., Specials)
Args:
soup: BeautifulSoup of overview page
start_sort_order: Starting sort order number
Returns:
List of ScrapedSeason objects
"""
seasons = []
sort_order = start_sort_order
# Find sections with season headers (but no corresponding menu link)
# These are typically Specials or other special categories
sections = soup.find_all('section')
for section in sections:
# Look for headers like "Specials", "Extras", etc.
header = section.find(['h2', 'h3'], id=re.compile(r'Special|Extra'))
if not header:
continue
season_name = header.get_text(strip=True)
# Skip if this is just a navigation element
if 'karussell' in season_name.lower():
continue
# Extract episodes from this section
episodes = self._extract_episodes_from_page(section)
if episodes:
season_type = self.classify_season_type(season_name)
season = ScrapedSeason(
name=season_name,
season_type=season_type,
sort_order=sort_order,
episodes=episodes
)
seasons.append(season)
sort_order += 1
return seasons
def _extract_grouped_from_overview(self, soup: BeautifulSoup) -> List[ScrapedSeason]:
"""
Extract all episodes from overview page and group by episode code prefix.
Used for series like "Wer weiß denn sowas?" where all seasons have the same name.
Args:
soup: BeautifulSoup of overview page
Returns:
List of ScrapedSeason objects grouped by episode code prefix
"""
# First, try to find season links to get proper season numbers
# Pattern: /episodenguide/11/30583 -> Season 11
season_links = soup.find_all('a', href=re.compile(r'episodenguide/(\d+)/'))
season_numbers = {}
for link in season_links:
href = link.get('href', '')
match = re.search(r'episodenguide/(\d+)/', href)
if match:
season_num = match.group(1)
season_numbers[season_num] = True
# Find all episode rows
all_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'})
# Group episodes by their code prefix (e.g., "1.01" → "1", "2.001" → "2")
groups = {}
for i, row in enumerate(all_rows, 1):
episode = self._parse_episode_row(row, i)
if not episode:
continue
# Extract group from row title attribute (e.g., "1.01 Title" → "1")
# This is more reliable than episode_code which might be just "01"
row_title = row.get('title', '')
title_code_match = re.match(r'^(\d+)\.', row_title)
if title_code_match:
group_key = title_code_match.group(1)
else:
# Fallback: try from episode code
code_match = re.match(r'^(\d+)[x.]', episode.episode_code)
if not code_match:
code_match = re.match(r'^(\d+)', episode.episode_code)
if code_match:
group_key = code_match.group(1)
else:
group_key = episode.episode_code
# Check title for special season indicators (XXL, Quizmarathon, etc.)
title_lower = episode.title.lower()
if 'xxl' in title_lower:
group_key = 'XXL'
elif 'quizmarathon' in title_lower:
group_key = 'Quizmarathon'
if group_key not in groups:
groups[group_key] = []
groups[group_key].append(episode)
# Convert groups to ScrapedSeason objects
seasons = []
# Sort by numeric key (treating group_key as integer when possible)
def sort_key(item):
group_key, _ = item
# Numeric groups sort first by value; non-numeric groups (XXL, Quizmarathon, ...) follow alphabetically
if group_key.isdigit():
return (0, int(group_key))
return (1, group_key)
for sort_order, (group_key, episode_list) in enumerate(sorted(groups.items(), key=sort_key)):
# Fix duplicate episode codes within this group
episode_list = self._fix_duplicate_episode_codes(episode_list)
# Determine season name and type
if group_key == 'XXL':
season_name = "Wer weiß denn sowas XXL"
season_type = SeasonType.EXTRAS
elif group_key == 'Quizmarathon':
season_name = "Quizmarathon"
season_type = SeasonType.SPECIALS
elif group_key == '0':
season_name = "Specials"
season_type = SeasonType.SPECIALS
elif group_key.isdigit() and group_key in season_numbers:
# This is a proper season number from the URLs
season_name = f"Staffel {int(group_key)}"
season_type = SeasonType.NORMAL
else:
# Fallback: use year or group number
first_ep_date = next((ep.date_de_tv for ep in episode_list if ep.date_de_tv), None)
if first_ep_date:
season_name = str(first_ep_date.year)
season_type = SeasonType.YEAR_BASED
else:
season_name = f"Gruppe {group_key}"
season_type = SeasonType.NORMAL
season = ScrapedSeason(
name=season_name,
season_type=season_type,
sort_order=sort_order,
episodes=episode_list
)
seasons.append(season)
return seasons
def _extract_season_from_overview(self, soup: BeautifulSoup, season_name: str, sort_order: int) -> Optional[ScrapedSeason]:
"""
Extract episodes for a specific season from the overview page
This is used when individual season pages don't have episode_number data
Args:
soup: BeautifulSoup of overview page
season_name: Name of season to extract (e.g., "Staffel 1")
sort_order: Sort order number
Returns:
ScrapedSeason or None
"""
# Find all episode rows on overview page
all_episode_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'})
if not all_episode_rows:
return None
# Extract episodes and filter by season
episodes = []
seen_codes = {}
for i, row in enumerate(all_episode_rows, 1):
# Parse the episode
episode = self._parse_episode_row(row, i)
if not episode:
continue
# Check if episode belongs to this season by looking at the URL
href = row.get('href', '')
# Skip XXL or special versions if we're looking for plain season
# (These should be handled by their own dedicated pages)
if 'xxl' in href.lower() and ':' not in season_name.lower():
continue
# Extract season number from href like "/comedystreet/folgen/1x01-..."
season_match = re.search(r'/folgen/(\d+)x\d+', href)
if not season_match:
continue
# Convert season_name like "Staffel 1" to number
# Don't match "Staffel 1: Something" - only plain "Staffel 1"
season_num_match = re.search(r'^Staffel\s+(\d+)$', season_name.strip())
if not season_num_match:
# Try without "Staffel" prefix - might be just "1", "2", etc.
season_num_match = re.search(r'^(\d+)$', season_name.strip())
if not season_num_match:
# This is a special season like "Staffel 1: XXL", skip it
# Those will be handled by their own dedicated pages
continue
expected_season_num = int(season_num_match.group(1))
actual_season_num = int(season_match.group(1))
if expected_season_num != actual_season_num:
continue
# This episode belongs to the requested season
episodes.append(episode)
if not episodes:
return None
# Fix duplicate episode codes
episodes = self._fix_duplicate_episode_codes(episodes)
season_type = self.classify_season_type(season_name)
logger.debug(f"Scraped season '{season_name}': {len(episodes)} episodes found")
return ScrapedSeason(
name=season_name,
season_type=season_type,
sort_order=sort_order,
episodes=episodes
)
def _scrape_season_page(self, season_name: str, season_url: str, sort_order: int) -> Optional[ScrapedSeason]:
"""
Scrape a single season page
Args:
season_name: Name of the season
season_url: URL to season page
sort_order: Sort order number
Returns:
ScrapedSeason or None
"""
try:
response = self.session.get(season_url, timeout=15)
response.raise_for_status()
except requests.RequestException as e:
print(f" Fehler beim Laden: {e}")
return None
soup = BeautifulSoup(response.content, 'lxml')
# Extract episodes from this page
episodes = self._extract_episodes_from_page(soup)
if not episodes:
return None
season_type = self.classify_season_type(season_name)
logger.debug(f"Scraped season '{season_name}': {len(episodes)} episodes found")
return ScrapedSeason(
name=season_name,
season_type=season_type,
sort_order=sort_order,
episodes=episodes
)
def _fix_duplicate_episode_codes(self, episodes: List[ScrapedEpisode]) -> List[ScrapedEpisode]:
"""
Fix duplicate episode codes by adding letter suffixes (a, b, c, etc.)
to ALL episodes that share the same code (including the first occurrence).
For example, if three episodes have code "01", they become "01a", "01b", "01c".
Args:
episodes: List of episodes that may contain duplicates
Returns:
List of episodes with unique codes
"""
from collections import Counter
# Count how many times each code appears
code_counts = Counter(ep.episode_code for ep in episodes)
# Track which suffix to use for each code
code_suffixes = {}
# Process each episode
for episode in episodes:
original_code = episode.episode_code
# If this code appears more than once, add suffix to ALL occurrences
if code_counts[original_code] > 1:
# Get next suffix for this code (a, b, c, ...)
if original_code not in code_suffixes:
code_suffixes[original_code] = ord('a')
suffix_char = chr(code_suffixes[original_code])
episode.episode_code = f"{original_code}{suffix_char}"
code_suffixes[original_code] += 1
return episodes
def _extract_episodes_from_page(self, soup: BeautifulSoup) -> List[ScrapedEpisode]:
"""
Extract episodes from a season page
fernsehserien.de uses H3 headers for episode titles within section elements
OR section elements with itemprop="episode" (alternative layout)
"""
episodes = []
# Try method 1: Find all H3 elements that start with a number or "Folge X"
all_h3 = soup.find_all('h3')
# Accept both "41. Title" and "Folge 42" formats
episode_h3s = [h3 for h3 in all_h3 if re.match(r'^(\d+[a-z]?\.|Folge\s+\d+)', h3.get_text(strip=True))]
if episode_h3s:
# Method 1: H3-based extraction (main overview page with detailed episode info)
for h3 in episode_h3s:
episode = self._parse_episode_h3(h3)
if episode:
episodes.append(episode)
else:
# Method 2: Try overview page table format with <a role="row" itemprop="episode">
# This format has episode_number in the cells
episode_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'})
if episode_rows:
for i, row in enumerate(episode_rows, 1):
episode = self._parse_episode_row(row, i)
if episode:
episodes.append(episode)
else:
# Method 3: Try alternative layout with section[itemprop="episode"]
# This is used on some dedicated season pages (e.g., ComedyStreet Staffel 6+)
episode_sections = soup.find_all('section', itemprop='episode')
for i, section in enumerate(episode_sections, 1):
episode = self._parse_episode_section(section, i)
if episode:
episodes.append(episode)
# After collecting all episodes, handle duplicate episode_codes
# This ensures ALL episodes with the same code get suffixes (a, b, c, etc.)
episodes = self._fix_duplicate_episode_codes(episodes)
return episodes
def _parse_episode_h3(self, h3) -> Optional[ScrapedEpisode]:
"""
Parse an episode from an H3 header
Format: "1. Episode Title (Original Title)"
The parent section contains date information in text format
"""
title_text = h3.get_text(strip=True)
# Extract episode_number (overall series number) from H3 text
# Example: "111. Episode Title" -> episode_number = 111
# Example: "Folge 42" -> episode_number = 42
episode_number = None
number_match = re.match(r'^(\d+)[a-z]?\.', title_text)
if not number_match:
# Try "Folge X" format
number_match = re.match(r'^Folge\s+(\d+)', title_text)
if number_match:
try:
episode_number = int(number_match.group(1))
except ValueError:
pass
# Extract title (everything after the number or "Folge X")
title_match = re.match(r'^\d+[a-z]?\.?\s*(.+)', title_text)
if not title_match:
# Try "Folge X Title" format
title_match = re.match(r'^Folge\s+\d+\s*(.+)?', title_text)
if title_match:
title = title_match.group(1).strip() if title_match.group(1) else title_text
else:
title = title_text
# Remove English title in parentheses if present
# Example: "Gewöhnliche Leute(Nosedive)" -> "Gewöhnliche Leute"
# Example: "White Christmas" -> "White Christmas" (no change if no German title)
title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip()
# Get the parent section that contains date information
section = h3.find_parent('section')
if not section:
# Fallback: just return episode without dates
return ScrapedEpisode(
episode_code="00",
title=title,
episode_number=episode_number
)
# Extract episode_code (season-specific episode number) and episode_id from episode link
# Format: /folgen/12x01-title-1828679 or /folgen/01-title-1828679 (for specials)
episode_code = None
episode_id = None
episode_link = section.find('a', href=re.compile(r'/folgen/'))
if episode_link:
href = episode_link.get('href', '')
# Extract episode_id (last number in URL)
episode_id_match = re.search(r'-(\d+)$', href)
if episode_id_match:
episode_id = episode_id_match.group(1)
# Try format: /folgen/12x01-... (regular episodes)
season_episode_match = re.search(r'/folgen/(\d+)x(\d+)', href)
if season_episode_match:
episode_code = season_episode_match.group(2).zfill(2)
else:
# Try format: /folgen/01-... (specials without season prefix)
special_match = re.search(r'/folgen/(\d+)-', href)
if special_match:
episode_code = special_match.group(1).zfill(2)
# Fallback: extract from H3 text if link extraction failed
# Examples: "0.01 Title" -> "01", "1. Title" -> "01", "12a. Title" -> "12a"
if not episode_code:
# Try to find pattern like "X.YY" in title text (e.g., "0.01")
decimal_match = re.match(r'^\d+\.(\d+[a-z]?)', title_text)
if decimal_match:
ep_num = decimal_match.group(1)
if ep_num.isdigit():
episode_code = ep_num.zfill(2)
else:
# Handle cases like "12a"
episode_code = ep_num[:-1].zfill(2) + ep_num[-1] if len(ep_num) >= 2 else ep_num.zfill(2)
else:
# Last resort: use the episode number from start of H3
episode_code = self.extract_episode_code(title_text)
# Extract dates from the section text
section_text = section.get_text()
dates = self._extract_dates_from_text(section_text)
return ScrapedEpisode(
episode_code=episode_code,
title=title,
episode_number=episode_number,
episode_id=episode_id,
date_de_tv=dates.get('de_tv'),
date_de_streaming=dates.get('de_streaming'),
date_de_home_media=dates.get('de_home_media'),
date_de_sync=dates.get('de_sync'),
date_original=dates.get('original')
)
def _parse_episode_section(self, section, fallback_number: int) -> Optional[ScrapedEpisode]:
"""
Parse an episode from a section element with itemprop="episode"
Used on dedicated season pages with alternative layout (e.g., ComedyStreet Staffel 1-5)
Args:
section: BeautifulSoup section element
fallback_number: Episode number to use if it cannot be extracted from the URL
"""
# Extract title from itemprop="name"
title_elem = section.find(itemprop='name')
if not title_elem:
return None
title = title_elem.get_text(strip=True)
# Remove English title in parentheses if present
title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip()
# Try to extract episode info from URL: href="/series/folgen/1x01-title-1828679"
url_elem = section.find('a', itemprop='url')
episode_code = None
episode_number = None
episode_id = None
if url_elem:
href = url_elem.get('href', '')
# Extract episode_id (last number in URL)
episode_id_match = re.search(r'-(\d+)$', href)
if episode_id_match:
episode_id = episode_id_match.group(1)
# Pattern: /folgen/1x01-... or /folgen/SxE-...
match = re.search(r'/folgen/(\d+)x(\d+)', href)
if match:
episode_code = match.group(2).zfill(2)
# Try to extract episode_number from <div role="cell" itemprop="episodeNumber">
# Structure on overview page: <div content="1" itemprop="episodeNumber" role="cell">01</div>
# The content attribute contains the overall episode number
cell_div = section.find('div', itemprop='episodeNumber')
if cell_div:
# Extract from content attribute
content_attr = cell_div.get('content')
if content_attr:
try:
episode_number = int(content_attr)
except (ValueError, TypeError):
pass
# Fallback: use sequential numbering
if not episode_code:
episode_code = str(fallback_number).zfill(2)
# Extract dates
section_text = section.get_text()
dates = self._extract_dates_from_text(section_text)
return ScrapedEpisode(
episode_code=episode_code,
title=title,
episode_number=episode_number,
episode_id=episode_id,
date_de_tv=dates.get('de_tv'),
date_de_streaming=dates.get('de_streaming'),
date_de_home_media=dates.get('de_home_media'),
date_de_sync=dates.get('de_sync'),
date_original=dates.get('original')
)
def _parse_episode_row(self, row, fallback_number: int) -> Optional[ScrapedEpisode]:
"""
Parse an episode from a row element (<a role="row" itemprop="episode">)
Used on overview pages with table format
Args:
row: BeautifulSoup <a> element with role="row"
fallback_number: Episode number to use if it cannot be extracted
"""
# Extract episode_id from row href (e.g., /series/folgen/1x01-title-1828679)
episode_id = None
href = row.get('href', '')
if href:
episode_id_match = re.search(r'-(\d+)$', href)
if episode_id_match:
episode_id = episode_id_match.group(1)
# Get all cells in the row
cells = row.find_all('div', role='cell')
if len(cells) < 7:
return None
# Cell structure based on test output:
# Cell 2: Contains overall episode number (before span) + season.episode (in span)
# Cell 5: Contains episode_code with itemprop="episodeNumber"
# Cell 7: Contains title with itemprop="name"
# Cell 8: Contains date
# Extract overall episode_number from cell 2
episode_number = None
if len(cells) >= 2:
cell2 = cells[1] # Index 1 = cell 2
# Structure: <div role="cell">1<span class="episodenliste-schmal"><b>1.01</b></span></div>
# Extract the number before the span
cell2_text = ''
for child in cell2.children:
if isinstance(child, str):
cell2_text += child.strip()
else:
# Stop at first tag (the span)
break
if cell2_text:
try:
episode_number = int(cell2_text)
except ValueError:
pass
# Extract episode_code from cell 5 (itemprop="episodeNumber")
episode_code = None
ep_code_cell = row.find('div', itemprop='episodeNumber')
if ep_code_cell:
code_text = ep_code_cell.get_text(strip=True)
if code_text:
episode_code = code_text.zfill(2)
# Fallback for episode_code
if not episode_code:
episode_code = str(fallback_number).zfill(2)
# Extract title from cell 7
title = ""
title_cell = row.find('div', class_='episodenliste-2019-episodentitel')
if title_cell:
title_elem = title_cell.find(itemprop='name')
if title_elem:
title = title_elem.get_text(strip=True)
# Remove English title in parentheses if present
title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip()
# Extract date from cell 8 (simple date text)
date_de_tv = None
if len(cells) >= 8:
date_cell = cells[7] # Index 7 = cell 8
date_text = date_cell.get_text(strip=True)
if date_text:
date_de_tv = self.parse_german_date(date_text)
return ScrapedEpisode(
episode_code=episode_code,
title=title,
episode_number=episode_number,
episode_id=episode_id,
date_de_tv=date_de_tv,
date_de_streaming=None,
date_de_home_media=None,
date_de_sync=None,
date_original=None
)
def _extract_dates_from_text(self, text: str) -> Dict[str, Optional[datetime]]:
"""
Extract dates from plain text containing German date labels
Expected format:
- "Deutsche TV-Premiere Mi. 11.12.2013 RTL Crime"
- "Deutsche Streaming-Premiere Fr. 21.10.2016 Netflix"
- "Deutsche Home-Media-Premiere Do. 21.11.2024"
- "Original-TV-Premiere So. 04.12.2011 Channel 4"
- "Premiere der deutschen Synchronfassung ..."
"""
dates = {
'de_tv': None,
'de_streaming': None,
'de_home_media': None,
'de_sync': None,
'original': None
}
# Search for "Deutsche TV-Premiere" followed by a date
match = re.search(r'Deutsche\s+TV-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
if match:
dates['de_tv'] = self.parse_german_date(match.group(1))
# Search for "Deutsche Streaming-Premiere"
match = re.search(r'Deutsche\s+Streaming-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
if match:
dates['de_streaming'] = self.parse_german_date(match.group(1))
# Search for "Deutsche Home-Media-Premiere"
match = re.search(r'Deutsche\s+Home-Media-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
if match:
dates['de_home_media'] = self.parse_german_date(match.group(1))
# Search for "Premiere der deutschen Synchronfassung"
match = re.search(r'Premiere\s+der\s+deutschen\s+Synchronfassung\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
if match:
dates['de_sync'] = self.parse_german_date(match.group(1))
# Search for "Original-TV-Premiere" or "Original-Streaming-Premiere"
match = re.search(r'Original-(?:TV|Streaming)-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
if match:
dates['original'] = self.parse_german_date(match.group(1))
return dates
def main():
"""Test the scraper"""
if len(sys.argv) < 2:
print("Usage: python fernsehserien_scraper.py <url>")
print("Example: python fernsehserien_scraper.py https://www.fernsehserien.de/black-mirror/episodenguide")
sys.exit(1)
url = sys.argv[1]
scraper = FernsehserienScraper()
title, seasons = scraper.scrape_series(url)
print(f"\n=== {title} ===")
print(f"Staffeln gesamt: {len(seasons)}\n")
for season in seasons:
print(f"{season.name} ({season.season_type.value}): {len(season.episodes)} Episoden")
for ep in season.episodes[:3]: # Show first 3
dates = []
if ep.date_original:
dates.append(f"Orig: {ep.date_original.strftime('%d.%m.%Y')}")
if ep.date_de_tv:
dates.append(f"DE: {ep.date_de_tv.strftime('%d.%m.%Y')}")
date_str = ", ".join(dates) if dates else "Keine Daten"
print(f" {ep.episode_code}. {ep.title} ({date_str})")
if len(season.episodes) > 3:
print(f" ... und {len(season.episodes) - 3} weitere")
print()
if __name__ == "__main__":
main()