#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "requests",
#     "beautifulsoup4",
#     "lxml",
# ]
# ///
"""
Scraper for fernsehserien.de using BeautifulSoup

This is a standalone scraper that can be used independently
"""

import re
import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Optional, Tuple
from datetime import datetime
from dataclasses import dataclass
import sys
from pathlib import Path

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from serien_checker.database.models import SeasonType
from serien_checker.scraper.browser_scraper import ScrapedEpisode, ScrapedSeason
from serien_checker.utils.logger import setup_logger

logger = setup_logger()


class FernsehserienScraper:
    """
    Scraper for fernsehserien.de using requests + BeautifulSoup
    """

    BASE_URL = "https://www.fernsehserien.de"
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

    @staticmethod
    def parse_german_date(date_str: str) -> Optional[datetime]:
        """
        Parse German date format to datetime

        Supports formats:
        - DD.MM.YYYY
        - DD.MM.YY
        - YYYY
        """
        if not date_str or date_str.strip() == "":
            return None

        date_str = date_str.strip()

        # Try DD.MM.YYYY or DD.MM.YY
        patterns = [
            (r'(\d{1,2})\.(\d{1,2})\.(\d{4})', '%d.%m.%Y'),
            (r'(\d{1,2})\.(\d{1,2})\.(\d{2})', '%d.%m.%y'),
        ]
        for pattern, fmt in patterns:
            match = re.search(pattern, date_str)
            if match:
                try:
                    return datetime.strptime(match.group(0), fmt)
                except ValueError:
                    continue

        # Try just year (YYYY)
        year_match = re.search(r'\b(19\d{2}|20\d{2})\b', date_str)
        if year_match:
            try:
                return datetime(int(year_match.group(1)), 1, 1)
            except ValueError:
                pass

        return None

    @staticmethod
    def classify_season_type(season_name: str) -> SeasonType:
        """Classify season type based on name"""
        name_lower = season_name.lower()
        if any(keyword in name_lower for keyword in ['special', 'specials']):
            return SeasonType.SPECIALS
        if any(keyword in name_lower for keyword in ['extra', 'extras', 'bonus']):
            return SeasonType.EXTRAS
        if any(keyword in name_lower for keyword in ['best', 'best-of', 'best of']):
            return SeasonType.BEST_OF
        if re.match(r'^(19|20)\d{2}$', season_name.strip()):
            return SeasonType.YEAR_BASED
        return SeasonType.NORMAL

    @staticmethod
    def extract_episode_code(episode_text: str) -> str:
        """
        Extract episode code from text

        Examples: "1. Folge" -> "01", "12a. Teil A" -> "12a"
        """
        match = re.search(r'^(\d+[a-z]?)\.', episode_text.strip())
        if match:
            code = match.group(1)
            if code.isdigit():
                return code.zfill(2)
            elif len(code) >= 2 and code[:-1].isdigit():
                return code[:-1].zfill(2) + code[-1]
        return "00"

    def scrape_series(self, url: str) -> Tuple[str, List[ScrapedSeason]]:
        """
        Scrape series from fernsehserien.de

        This scraper works in two steps:
        1. Scrape overview page to get season links
        2.
Scrape each season page to get episodes Args: url: URL to episode guide (overview page) Returns: Tuple of (series_title, list of ScrapedSeason) """ print(f"Scraping overview {url}...") response = self.session.get(url, timeout=15) response.raise_for_status() soup = BeautifulSoup(response.content, 'lxml') # Extract series title series_title = self._extract_series_title(soup) print(f"Serie: {series_title}") # Find season links from the series menu season_links = self._extract_season_links(soup, url) print(f"Gefunden: {len(season_links)} Staffeln mit eigenen Seiten") # Check if most season names contain "bisher X Folgen" pattern # If so, we need to group episodes by code prefix instead bisher_pattern = re.compile(r'bisher.*\d+.*Folgen', re.IGNORECASE) has_bisher = [bool(bisher_pattern.search(name)) for name, _ in season_links] bisher_count = sum(has_bisher) mostly_bisher = bisher_count > len(season_links) // 2 and season_links if mostly_bisher: # Special case: All seasons have same name (like "bisher 1369 Folgen") # Extract and group episodes from overview page by episode code prefix print(f"Alle Staffeln heißen '{season_links[0][0]}' - gruppiere nach Episode-Code") seasons = self._extract_grouped_from_overview(soup) print(f"Gefunden: {len(seasons)} gruppierte Staffeln mit insgesamt {sum(len(s.episodes) for s in seasons)} Episoden") return series_title, seasons # Scrape each season from dedicated pages # First, check if any season names have duplicates (same base number) season_names = [name for name, _ in season_links] has_duplicate_seasons = False for name in season_names: # Extract base season number (e.g., "Staffel 6" from "Staffel 6: Video-Podcast") base_match = re.search(r'Staffel\s+(\d+)', name) if base_match: base_num = base_match.group(1) # Count how many seasons have this base number count = sum(1 for n in season_names if f'Staffel {base_num}' in n) if count > 1: has_duplicate_seasons = True break seasons = [] for i, (season_name, season_url) in enumerate(season_links): print(f" Lade {season_name}...") season = self._scrape_season_page(season_name, season_url, i) # If season has no episodes or episodes have no episode_number, # try to extract from overview page instead if season and season.episodes: # Check if any episode has episode_number has_episode_numbers = any(ep.episode_number is not None for ep in season.episodes) # Don't use overview page if there are duplicate season numbers # (e.g., "Staffel 6" and "Staffel 6: Video-Podcast") # because the overview page can't distinguish between variants with same base number skip_overview = has_duplicate_seasons # Also check if episode_numbers look wrong (e.g., starting at 1 for each season) # This happens when scraping from
which only has # season-relative numbers, not overall series numbers needs_overview = False if has_episode_numbers and not skip_overview: # If all episodes have numbers 1, 2, 3... but this isn't Staffel 1, # the numbers are probably wrong (season-relative instead of series-wide) first_ep_num = next((ep.episode_number for ep in season.episodes if ep.episode_number), None) if first_ep_num == 1 and i > 0: # i > 0 means not the first season needs_overview = True if (not has_episode_numbers or needs_overview) and not skip_overview: # Try extracting from overview page (only when safe) overview_season = self._extract_season_from_overview(soup, season_name, i) if overview_season and overview_season.episodes: season = overview_season print(f" {len(season.episodes)} Episoden (von Übersichtsseite)") else: print(f" {len(season.episodes)} Episoden") else: print(f" {len(season.episodes)} Episoden") seasons.append(season) elif season: # Empty season, try overview page (only when safe) if not skip_overview: overview_season = self._extract_season_from_overview(soup, season_name, i) if overview_season and overview_season.episodes: seasons.append(overview_season) print(f" {len(overview_season.episodes)} Episoden (von Übersichtsseite)") # Also check for seasons directly on overview page (e.g., Specials) overview_seasons = self._extract_seasons_from_overview(soup, len(seasons)) if overview_seasons: # Track existing season names to avoid duplicates existing_names = {s.name for s in seasons} new_seasons = [s for s in overview_seasons if s.name not in existing_names] if new_seasons: print(f"Gefunden: {len(new_seasons)} zusätzliche Staffeln auf Übersichtsseite") for season in new_seasons: seasons.append(season) print(f" {season.name}: {len(season.episodes)} Episoden") return series_title, seasons def _extract_series_title(self, soup: BeautifulSoup) -> str: """Extract series title from page""" # Try meta tags first og_title = soup.find('meta', property='og:title') if og_title and og_title.get('content'): title = og_title['content'] # Remove ": Episodenguide" suffix title = re.sub(r':\s*Episodenguide.*$', '', title, flags=re.IGNORECASE) return title.strip() # Fallback to h1 h1 = soup.find('h1') if h1: return h1.get_text(strip=True) return "Unbekannte Serie" def _extract_season_links(self, soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]: """ Extract season links from the series menu Returns: List of (season_name, season_url) tuples """ season_links = [] seen_urls = set() # Collect links from multiple sources links = [] # 1. Try to find the series menu navigation (newer layout) series_menu = soup.find('nav', class_='series-menu') if series_menu: # Find the episodenguide submenu episode_menu = series_menu.find('li', {'data-menu-item': 'episodenguide'}) if episode_menu: # Same pattern as global search - no trailing slash required links.extend(episode_menu.find_all('a', href=re.compile(r'episodenguide/(staffel-[^/]+|\d+)'))) # 2. Search globally for season links (works for pages without series-menu) # Pattern matches: /episodenguide/staffel-1/, /episodenguide/staffel-1/18522, /episodenguide/0/, etc. 
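        # (Illustrative note, assuming typical fernsehserien.de paths; "dark" below is a
        #  made-up series slug, not taken from this file. The regex used here should match
        #  "/dark/episodenguide/staffel-2/12345" with group(1) == "staffel-2" and
        #  "/dark/episodenguide/7/98765" with group(1) == "7", but not a bare
        #  "/dark/episodenguide" overview link.)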
# Note: No trailing slash required - URLs can end with /staffel-1 or /staffel-1/12345 global_links = soup.find_all('a', href=re.compile(r'episodenguide/(staffel-[^/]+|\d+)')) links.extend(global_links) for link in links: # Extract season name more robustly # First, try to get text from strong/b tags only (ignoring image alt text) strong_tag = link.find(['strong', 'b']) if strong_tag: season_name = strong_tag.get_text(strip=True) else: # Fallback: get direct text children only (exclude nested elements like img) season_name = ''.join(link.find_all(string=True, recursive=False)).strip() # If still empty, use full text if not season_name: season_name = link.get_text(strip=True) # Clean up image captions that might leak through season_name = re.sub(r'Bild:\s*[^A-Z]*(?=[A-Z])', '', season_name) season_name = re.sub(r'Foto:\s*[^A-Z]*(?=[A-Z])', '', season_name) # Normalize whitespace (convert "Staffel6" or "Staffel 6" to "Staffel 6") season_name = ' '.join(season_name.split()) season_url = link.get('href', '') # If season name is just "Staffel" without number, try to extract from URL if season_name.lower() in ['staffel', 'season']: # Try to extract season number from URL like "staffel-6/47453" url_match = re.search(r'/staffel-(\d+)', season_url) if url_match: season_num = url_match.group(1) season_name = f"Staffel {season_num}" logger.debug(f"Added season number from URL: '{season_name}'") logger.debug(f"Extracted season name: '{season_name}' from link {season_url}") # Skip navigation/anchor links if not season_url or season_url.startswith('#'): continue if season_url: # Make absolute URL if season_url.startswith('/'): season_url = self.BASE_URL + season_url elif not season_url.startswith('http'): # Relative URL like "episodenguide/0/28673" # Need to combine with base URL path from urllib.parse import urljoin season_url = urljoin(base_url, season_url) # Skip duplicates (extract staffel identifier for robust comparison) # This ignores different series slugs (e.g., nachtstreife-2020 vs nachtstreife-2-0) staffel_match = re.search(r'/(staffel-[^/]+/\d+)', season_url) if staffel_match: staffel_identifier = staffel_match.group(1).lower() else: # Fallback to full URL normalization for non-standard URLs staffel_identifier = season_url.lower().rstrip('/') logger.debug(f"Season identifier: '{staffel_identifier}' from {season_url}") if staffel_identifier in seen_urls: logger.debug(f"Skipping duplicate season URL: {season_url}") continue # Skip "Übersicht" link if season_name.lower() in ['übersicht', 'episoden']: continue # Clean up season name: extract year if it's embedded in text # e.g., "Bild: NDR2026" -> "2026" # Look for 4-digit year anywhere in the string (without word boundaries) year_match = re.search(r'(20\d{2}|19\d{2})', season_name) if year_match: year = year_match.group(1) # Check if name starts with image caption (e.g., "Bild: NDR2026") if re.match(r'^(Bild:|Foto:)', season_name, re.IGNORECASE): season_name = year # Handle duplicate season names by adding Teil 2, Teil 3, etc. 
# This happens when there are multiple seasons with the same name but different URLs # (e.g., "2020" regular episodes vs "2020" specials) existing_names = [name for name, _ in season_links] if season_name in existing_names: # Count how many times this base name already exists (including "Teil X" variants) base_name = season_name count = 1 # Count both the base name and all "Teil X" variants for name in existing_names: if name == base_name or name.startswith(f"{base_name} Teil "): count += 1 original_name = season_name season_name = f"{season_name} Teil {count}" logger.debug(f"Duplicate season name detected: '{original_name}' -> '{season_name}' (URL: {season_url})") seen_urls.add(staffel_identifier) season_links.append((season_name, season_url)) logger.debug(f"Added season: '{season_name}' -> {season_url}") return season_links def _extract_seasons_from_overview(self, soup: BeautifulSoup, start_sort_order: int) -> List[ScrapedSeason]: """ Extract seasons that are shown directly on the overview page (e.g., Specials) Args: soup: BeautifulSoup of overview page start_sort_order: Starting sort order number Returns: List of ScrapedSeason objects """ seasons = [] sort_order = start_sort_order # Find sections with season headers (but no corresponding menu link) # These are typically Specials or other special categories sections = soup.find_all('section') for section in sections: # Look for headers like "Specials", "Extras", etc. header = section.find(['h2', 'h3'], id=re.compile(r'Special|Extra')) if not header: continue season_name = header.get_text(strip=True) # Skip if this is just a navigation element if 'karussell' in season_name.lower(): continue # Extract episodes from this section episodes = self._extract_episodes_from_page(section) if episodes: season_type = self.classify_season_type(season_name) season = ScrapedSeason( name=season_name, season_type=season_type, sort_order=sort_order, episodes=episodes ) seasons.append(season) sort_order += 1 return seasons def _extract_grouped_from_overview(self, soup: BeautifulSoup) -> List[ScrapedSeason]: """ Extract all episodes from overview page and group by episode code prefix. Used for series like "Wer weiß denn sowas?" where all seasons have the same name. 
Args: soup: BeautifulSoup of overview page Returns: List of ScrapedSeason objects grouped by episode code prefix """ # First, try to find season links to get proper season numbers # Pattern: /episodenguide/11/30583 -> Season 11 season_links = soup.find_all('a', href=re.compile(r'episodenguide/(\d+)/')) season_numbers = {} for link in season_links: href = link.get('href', '') match = re.search(r'episodenguide/(\d+)/', href) if match: season_num = match.group(1) season_numbers[season_num] = True # Find all episode rows all_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'}) # Group episodes by their code prefix (e.g., "1.01" → "1", "2.001" → "2") groups = {} for i, row in enumerate(all_rows, 1): episode = self._parse_episode_row(row, i) if not episode: continue # Extract group from row title attribute (e.g., "1.01 Title" → "1") # This is more reliable than episode_code which might be just "01" row_title = row.get('title', '') title_code_match = re.match(r'^(\d+)\.', row_title) if title_code_match: group_key = title_code_match.group(1) else: # Fallback: try from episode code code_match = re.match(r'^(\d+)[x.]', episode.episode_code) if not code_match: code_match = re.match(r'^(\d+)', episode.episode_code) if code_match: group_key = code_match.group(1) else: group_key = episode.episode_code # Check title for special season indicators (XXL, Quizmarathon, etc.) title_lower = episode.title.lower() if 'xxl' in title_lower: group_key = 'XXL' elif 'quizmarathon' in title_lower: group_key = 'Quizmarathon' if group_key not in groups: groups[group_key] = [] groups[group_key].append(episode) # Convert groups to ScrapedSeason objects seasons = [] # Sort by numeric key (treating group_key as integer when possible) def sort_key(item): group_key, _ = item # Try to convert to int for proper numeric sorting try: if group_key.isdigit(): return (0, int(group_key)) # Numeric groups first else: return (1, group_key) # Non-numeric groups (XXL, etc.) 
after except: return (1, group_key) for sort_order, (group_key, episode_list) in enumerate(sorted(groups.items(), key=sort_key)): # Fix duplicate episode codes within this group episode_list = self._fix_duplicate_episode_codes(episode_list) # Determine season name and type if group_key == 'XXL': season_name = "Wer weiß denn sowas XXL" season_type = SeasonType.EXTRAS elif group_key == 'Quizmarathon': season_name = "Quizmarathon" season_type = SeasonType.SPECIALS elif group_key == '0': season_name = "Specials" season_type = SeasonType.SPECIALS elif group_key.isdigit() and group_key in season_numbers: # This is a proper season number from the URLs season_name = f"Staffel {int(group_key)}" season_type = SeasonType.NORMAL else: # Fallback: use year or group number first_ep_date = next((ep.date_de_tv for ep in episode_list if ep.date_de_tv), None) if first_ep_date: season_name = str(first_ep_date.year) season_type = SeasonType.YEAR_BASED else: season_name = f"Gruppe {group_key}" season_type = SeasonType.NORMAL season = ScrapedSeason( name=season_name, season_type=season_type, sort_order=sort_order, episodes=episode_list ) seasons.append(season) return seasons def _extract_season_from_overview(self, soup: BeautifulSoup, season_name: str, sort_order: int) -> Optional[ScrapedSeason]: """ Extract episodes for a specific season from the overview page This is used when individual season pages don't have episode_number data Args: soup: BeautifulSoup of overview page season_name: Name of season to extract (e.g., "Staffel 1") sort_order: Sort order number Returns: ScrapedSeason or None """ # Find all episode rows on overview page all_episode_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'}) if not all_episode_rows: return None # Extract episodes and filter by season episodes = [] seen_codes = {} for i, row in enumerate(all_episode_rows, 1): # Parse the episode episode = self._parse_episode_row(row, i) if not episode: continue # Check if episode belongs to this season by looking at the URL href = row.get('href', '') # Skip XXL or special versions if we're looking for plain season # (These should be handled by their own dedicated pages) if 'xxl' in href.lower() and ':' not in season_name.lower(): continue # Extract season number from href like "/comedystreet/folgen/1x01-..." season_match = re.search(r'/folgen/(\d+)x\d+', href) if not season_match: continue # Convert season_name like "Staffel 1" to number # Don't match "Staffel 1: Something" - only plain "Staffel 1" season_num_match = re.search(r'^Staffel\s+(\d+)$', season_name.strip()) if not season_num_match: # Try without "Staffel" prefix - might be just "1", "2", etc. 
season_num_match = re.search(r'^(\d+)$', season_name.strip()) if not season_num_match: # This is a special season like "Staffel 1: XXL", skip it # Those will be handled by their own dedicated pages continue expected_season_num = int(season_num_match.group(1)) actual_season_num = int(season_match.group(1)) if expected_season_num != actual_season_num: continue # This episode belongs to the requested season episodes.append(episode) if not episodes: return None # Fix duplicate episode codes episodes = self._fix_duplicate_episode_codes(episodes) season_type = self.classify_season_type(season_name) logger.debug(f"Scraped season '{season_name}': {len(episodes)} episodes found") return ScrapedSeason( name=season_name, season_type=season_type, sort_order=sort_order, episodes=episodes ) def _scrape_season_page(self, season_name: str, season_url: str, sort_order: int) -> Optional[ScrapedSeason]: """ Scrape a single season page Args: season_name: Name of the season season_url: URL to season page sort_order: Sort order number Returns: ScrapedSeason or None """ try: response = self.session.get(season_url, timeout=15) response.raise_for_status() except Exception as e: print(f" Fehler beim Laden: {e}") return None soup = BeautifulSoup(response.content, 'lxml') # Extract episodes from this page episodes = self._extract_episodes_from_page(soup) if not episodes: return None season_type = self.classify_season_type(season_name) logger.debug(f"Scraped season '{season_name}': {len(episodes)} episodes found") return ScrapedSeason( name=season_name, season_type=season_type, sort_order=sort_order, episodes=episodes ) def _fix_duplicate_episode_codes(self, episodes: List[ScrapedEpisode]) -> List[ScrapedEpisode]: """ Fix duplicate episode codes by adding letter suffixes (a, b, c, etc.) to ALL episodes that share the same code (including the first occurrence). For example, if three episodes have code "01", they become "01a", "01b", "01c". Args: episodes: List of episodes that may contain duplicates Returns: List of episodes with unique codes """ from collections import Counter # Count how many times each code appears code_counts = Counter(ep.episode_code for ep in episodes) # Track which suffix to use for each code code_suffixes = {} # Process each episode for episode in episodes: original_code = episode.episode_code # If this code appears more than once, add suffix to ALL occurrences if code_counts[original_code] > 1: # Get next suffix for this code (a, b, c, ...) if original_code not in code_suffixes: code_suffixes[original_code] = ord('a') suffix_char = chr(code_suffixes[original_code]) episode.episode_code = f"{original_code}{suffix_char}" code_suffixes[original_code] += 1 return episodes def _extract_episodes_from_page(self, soup: BeautifulSoup) -> List[ScrapedEpisode]: """ Extract episodes from a season page fernsehserien.de uses H3 headers for episode titles within section elements OR section elements with itemprop="episode" (alternative layout) """ episodes = [] # Try method 1: Find all H3 elements that start with a number or "Folge X" all_h3 = soup.find_all('h3') # Accept both "41. 
Title" and "Folge 42" formats episode_h3s = [h3 for h3 in all_h3 if re.match(r'^(\d+[a-z]?\.|Folge\s+\d+)', h3.get_text(strip=True))] if episode_h3s: # Method 1: H3-based extraction (main overview page with detailed episode info) for h3 in episode_h3s: episode = self._parse_episode_h3(h3) if episode: episodes.append(episode) else: # Method 2: Try overview page table format with # This format has episode_number in the cells episode_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'}) if episode_rows: for i, row in enumerate(episode_rows, 1): episode = self._parse_episode_row(row, i) if episode: episodes.append(episode) else: # Method 3: Try alternative layout with section[itemprop="episode"] # This is used on some dedicated season pages (e.g., ComedyStreet Staffel 6+) episode_sections = soup.find_all('section', itemprop='episode') for i, section in enumerate(episode_sections, 1): episode = self._parse_episode_section(section, i) if episode: episodes.append(episode) # After collecting all episodes, handle duplicate episode_codes # This ensures ALL episodes with the same code get suffixes (a, b, c, etc.) episodes = self._fix_duplicate_episode_codes(episodes) return episodes def _parse_episode_h3(self, h3) -> Optional[ScrapedEpisode]: """ Parse an episode from an H3 header Format: "1. Episode Title (Original Title)" The parent section contains date information in text format """ title_text = h3.get_text(strip=True) # Extract episode_number (overall series number) from H3 text # Example: "111. Episode Title" -> episode_number = 111 # Example: "Folge 42" -> episode_number = 42 episode_number = None number_match = re.match(r'^(\d+)[a-z]?\.', title_text) if not number_match: # Try "Folge X" format number_match = re.match(r'^Folge\s+(\d+)', title_text) if number_match: try: episode_number = int(number_match.group(1)) except ValueError: pass # Extract title (everything after the number or "Folge X") title_match = re.match(r'^\d+[a-z]?\.?\s*(.+)', title_text) if not title_match: # Try "Folge X Title" format title_match = re.match(r'^Folge\s+\d+\s*(.+)?', title_text) if title_match: title = title_match.group(1).strip() if title_match.group(1) else title_text else: title = title_text # Remove English title in parentheses if present # Example: "Gewöhnliche Leute(Nosedive)" -> "Gewöhnliche Leute" # Example: "White Christmas" -> "White Christmas" (no change if no German title) title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip() # Get the parent section that contains date information section = h3.find_parent('section') if not section: # Fallback: just return episode without dates return ScrapedEpisode( episode_code="00", title=title, episode_number=episode_number ) # Extract episode_code (season-specific episode number) and episode_id from episode link # Format: /folgen/12x01-title-1828679 or /folgen/01-title-1828679 (for specials) episode_code = None episode_id = None episode_link = section.find('a', href=re.compile(r'/folgen/')) if episode_link: href = episode_link.get('href', '') # Extract episode_id (last number in URL) episode_id_match = re.search(r'-(\d+)$', href) if episode_id_match: episode_id = episode_id_match.group(1) # Try format: /folgen/12x01-... (regular episodes) season_episode_match = re.search(r'/folgen/(\d+)x(\d+)', href) if season_episode_match: episode_code = season_episode_match.group(2).zfill(2) else: # Try format: /folgen/01-... 
(specials without season prefix) special_match = re.search(r'/folgen/(\d+)-', href) if special_match: episode_code = special_match.group(1).zfill(2) # Fallback: extract from H3 text if link extraction failed # Examples: "0.01 Title" -> "01", "1. Title" -> "01", "12a. Title" -> "12a" if not episode_code: # Try to find pattern like "X.YY" in title text (e.g., "0.01") decimal_match = re.match(r'^\d+\.(\d+[a-z]?)', title_text) if decimal_match: ep_num = decimal_match.group(1) if ep_num.isdigit(): episode_code = ep_num.zfill(2) else: # Handle cases like "12a" episode_code = ep_num[:-1].zfill(2) + ep_num[-1] if len(ep_num) >= 2 else ep_num.zfill(2) else: # Last resort: use the episode number from start of H3 episode_code = self.extract_episode_code(title_text) # Extract dates from the section text section_text = section.get_text() dates = self._extract_dates_from_text(section_text) return ScrapedEpisode( episode_code=episode_code, title=title, episode_number=episode_number, episode_id=episode_id, date_de_tv=dates.get('de_tv'), date_de_streaming=dates.get('de_streaming'), date_de_home_media=dates.get('de_home_media'), date_de_sync=dates.get('de_sync'), date_original=dates.get('original') ) def _parse_episode_section(self, section, fallback_number: int) -> Optional[ScrapedEpisode]: """ Parse an episode from a section element with itemprop="episode" Used on dedicated season pages with alternative layout (e.g., ComedyStreet Staffel 1-5) Args: section: BeautifulSoup section element fallback_number: Episode number to use if can't extract from URL """ # Extract title from itemprop="name" title_elem = section.find(itemprop='name') if not title_elem: return None title = title_elem.get_text(strip=True) # Remove English title in parentheses if present title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip() # Try to extract episode info from URL: href="/series/folgen/1x01-title-1828679" url_elem = section.find('a', itemprop='url') episode_code = None episode_number = None episode_id = None if url_elem: href = url_elem.get('href', '') # Extract episode_id (last number in URL) episode_id_match = re.search(r'-(\d+)$', href) if episode_id_match: episode_id = episode_id_match.group(1) # Pattern: /folgen/1x01-... or /folgen/SxE-... match = re.search(r'/folgen/(\d+)x(\d+)', href) if match: episode_code = match.group(2).zfill(2) # Try to extract episode_number from
        # the <div itemprop="episodeNumber"> cell (its "content" attribute).
        # Structure on overview page: <div itemprop="episodeNumber" content="111">01</div>
        # The content attribute contains the overall episode number
        cell_div = section.find('div', itemprop='episodeNumber')
        if cell_div:
            # Extract from content attribute
            content_attr = cell_div.get('content')
            if content_attr:
                try:
                    episode_number = int(content_attr)
                except (ValueError, TypeError):
                    pass

        # Fallback: use sequential numbering
        if not episode_code:
            episode_code = str(fallback_number).zfill(2)

        # Extract dates
        section_text = section.get_text()
        dates = self._extract_dates_from_text(section_text)

        return ScrapedEpisode(
            episode_code=episode_code,
            title=title,
            episode_number=episode_number,
            episode_id=episode_id,
            date_de_tv=dates.get('de_tv'),
            date_de_streaming=dates.get('de_streaming'),
            date_de_home_media=dates.get('de_home_media'),
            date_de_sync=dates.get('de_sync'),
            date_original=dates.get('original')
        )

    def _parse_episode_row(self, row, fallback_number: int) -> Optional[ScrapedEpisode]:
        """
        Parse an episode from a row element (<a role="row" itemprop="episode">
) Used on overview pages with table format Args: row: BeautifulSoup element with role="row" fallback_number: Episode number to use if can't extract """ # Extract episode_id from row href (e.g., /series/folgen/1x01-title-1828679) episode_id = None href = row.get('href', '') if href: episode_id_match = re.search(r'-(\d+)$', href) if episode_id_match: episode_id = episode_id_match.group(1) # Get all cells in the row cells = row.find_all('div', role='cell') if len(cells) < 7: return None # Cell structure based on test output: # Cell 2: Contains overall episode number (before span) + season.episode (in span) # Cell 5: Contains episode_code with itemprop="episodeNumber" # Cell 7: Contains title with itemprop="name" # Cell 8: Contains date # Extract overall episode_number from cell 2 episode_number = None if len(cells) >= 2: cell2 = cells[1] # Index 1 = cell 2 # Structure:
            # <div role="cell">1<span>1.01</span></div>
# Extract the number before the span cell2_text = '' for child in cell2.children: if isinstance(child, str): cell2_text += child.strip() else: # Stop at first tag (the span) break if cell2_text: try: episode_number = int(cell2_text) except ValueError: pass # Extract episode_code from cell 5 (itemprop="episodeNumber") episode_code = None ep_code_cell = row.find('div', itemprop='episodeNumber') if ep_code_cell: code_text = ep_code_cell.get_text(strip=True) if code_text: episode_code = code_text.zfill(2) # Fallback for episode_code if not episode_code: episode_code = str(fallback_number).zfill(2) # Extract title from cell 7 title = "" title_cell = row.find('div', class_='episodenliste-2019-episodentitel') if title_cell: title_elem = title_cell.find(itemprop='name') if title_elem: title = title_elem.get_text(strip=True) # Remove English title in parentheses if present title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip() # Extract date from cell 8 (simple date text) date_de_tv = None if len(cells) >= 8: date_cell = cells[7] # Index 7 = cell 8 date_text = date_cell.get_text(strip=True) if date_text: date_de_tv = self.parse_german_date(date_text) return ScrapedEpisode( episode_code=episode_code, title=title, episode_number=episode_number, episode_id=episode_id, date_de_tv=date_de_tv, date_de_streaming=None, date_de_home_media=None, date_de_sync=None, date_original=None ) def _extract_dates_from_text(self, text: str) -> Dict[str, Optional[datetime]]: """ Extract dates from plain text containing German date labels Expected format: - "Deutsche TV-Premiere Mi. 11.12.2013 RTL Crime" - "Deutsche Streaming-Premiere Fr. 21.10.2016 Netflix" - "Deutsche Home-Media-Premiere Do. 21.11.2024" - "Original-TV-Premiere So. 04.12.2011 Channel 4" - "Premiere der deutschen Synchronfassung ..." 
""" dates = { 'de_tv': None, 'de_streaming': None, 'de_home_media': None, 'de_sync': None, 'original': None } # Search for "Deutsche TV-Premiere" followed by a date match = re.search(r'Deutsche\s+TV-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text) if match: dates['de_tv'] = self.parse_german_date(match.group(1)) # Search for "Deutsche Streaming-Premiere" match = re.search(r'Deutsche\s+Streaming-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text) if match: dates['de_streaming'] = self.parse_german_date(match.group(1)) # Search for "Deutsche Home-Media-Premiere" match = re.search(r'Deutsche\s+Home-Media-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text) if match: dates['de_home_media'] = self.parse_german_date(match.group(1)) # Search for "Premiere der deutschen Synchronfassung" match = re.search(r'Premiere\s+der\s+deutschen\s+Synchronfassung\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text) if match: dates['de_sync'] = self.parse_german_date(match.group(1)) # Search for "Original-TV-Premiere" or "Original-Streaming-Premiere" match = re.search(r'Original-(?:TV|Streaming)-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text) if match: dates['original'] = self.parse_german_date(match.group(1)) return dates def main(): """Test the scraper""" if len(sys.argv) < 2: print("Usage: python fernsehserien_scraper.py ") print("Example: python fernsehserien_scraper.py https://www.fernsehserien.de/black-mirror/episodenguide") sys.exit(1) url = sys.argv[1] scraper = FernsehserienScraper() title, seasons = scraper.scrape_series(url) print(f"\n=== {title} ===") print(f"Staffeln gesamt: {len(seasons)}\n") for season in seasons: print(f"{season.name} ({season.season_type.value}): {len(season.episodes)} Episoden") for ep in season.episodes[:3]: # Show first 3 dates = [] if ep.date_original: dates.append(f"Orig: {ep.date_original.strftime('%d.%m.%Y')}") if ep.date_de_tv: dates.append(f"DE: {ep.date_de_tv.strftime('%d.%m.%Y')}") date_str = ", ".join(dates) if dates else "Keine Daten" print(f" {ep.episode_code}. {ep.title} ({date_str})") if len(season.episodes) > 3: print(f" ... und {len(season.episodes) - 3} weitere") print() if __name__ == "__main__": main()