#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "requests",
#     "beautifulsoup4",
#     "lxml",
# ]
# ///

"""
Scraper for fernsehserien.de using BeautifulSoup.

This is a standalone scraper that can be used independently.
"""

import re
import sys
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from serien_checker.database.models import SeasonType
from serien_checker.scraper.browser_scraper import ScrapedEpisode, ScrapedSeason
from serien_checker.utils.logger import setup_logger

logger = setup_logger()


class FernsehserienScraper:
    """
    Scraper for fernsehserien.de using requests + BeautifulSoup.
    """

    BASE_URL = "https://www.fernsehserien.de"
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

    @staticmethod
    def parse_german_date(date_str: str) -> Optional[datetime]:
        """
        Parse a German date string into a datetime.

        Supported formats:
        - DD.MM.YYYY
        - DD.MM.YY
        - YYYY
        """
        if not date_str or date_str.strip() == "":
            return None

        date_str = date_str.strip()

        # Try DD.MM.YYYY or DD.MM.YY
        patterns = [
            (r'(\d{1,2})\.(\d{1,2})\.(\d{4})', '%d.%m.%Y'),
            (r'(\d{1,2})\.(\d{1,2})\.(\d{2})', '%d.%m.%y'),
        ]

        for pattern, fmt in patterns:
            match = re.search(pattern, date_str)
            if match:
                try:
                    return datetime.strptime(match.group(0), fmt)
                except ValueError:
                    continue

        # Try just a year (YYYY)
        year_match = re.search(r'\b(19\d{2}|20\d{2})\b', date_str)
        if year_match:
            try:
                return datetime(int(year_match.group(1)), 1, 1)
            except ValueError:
                pass

        return None
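
    # Illustrative examples (worked by hand, not from the original file):
    #   parse_german_date("Mi. 11.12.2013") -> datetime(2013, 12, 11, 0, 0)
    #   parse_german_date("21.10.16")       -> datetime(2016, 10, 21, 0, 0)
    #   parse_german_date("2016")           -> datetime(2016, 1, 1, 0, 0)
    #   parse_german_date("unbekannt")      -> None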

    @staticmethod
    def classify_season_type(season_name: str) -> SeasonType:
        """Classify the season type based on its name"""
        name_lower = season_name.lower()

        if any(keyword in name_lower for keyword in ['special', 'specials']):
            return SeasonType.SPECIALS

        if any(keyword in name_lower for keyword in ['extra', 'extras', 'bonus']):
            return SeasonType.EXTRAS

        if any(keyword in name_lower for keyword in ['best', 'best-of', 'best of']):
            return SeasonType.BEST_OF

        if re.match(r'^(19|20)\d{2}$', season_name.strip()):
            return SeasonType.YEAR_BASED

        return SeasonType.NORMAL
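
    # Illustrative mapping (hand-checked against the rules above):
    #   "Specials"     -> SeasonType.SPECIALS
    #   "Bonusclips"   -> SeasonType.EXTRAS
    #   "Best of 2021" -> SeasonType.BEST_OF   ("best" matches before the year check)
    #   "2020"         -> SeasonType.YEAR_BASED
    #   "Staffel 3"    -> SeasonType.NORMAL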

    @staticmethod
    def extract_episode_code(episode_text: str) -> str:
        """
        Extract the episode code from text.
        Examples: "1. Folge" -> "01", "12a. Teil A" -> "12a"
        """
        match = re.search(r'^(\d+[a-z]?)\.', episode_text.strip())
        if match:
            code = match.group(1)
            if code.isdigit():
                return code.zfill(2)
            elif len(code) >= 2 and code[:-1].isdigit():
                return code[:-1].zfill(2) + code[-1]
        return "00"

    def scrape_series(self, url: str) -> Tuple[str, List[ScrapedSeason]]:
        """
        Scrape a series from fernsehserien.de.

        This scraper works in two steps:
        1. Scrape the overview page to get season links
        2. Scrape each season page to get episodes

        Args:
            url: URL of the episode guide (overview page)

        Returns:
            Tuple of (series_title, list of ScrapedSeason)
        """
        print(f"Scraping overview {url}...")

        response = self.session.get(url, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'lxml')

        # Extract the series title
        series_title = self._extract_series_title(soup)
        print(f"Series: {series_title}")

        # Find season links from the series menu
        season_links = self._extract_season_links(soup, url)
        print(f"Found {len(season_links)} seasons with dedicated pages")

        # Check if most season names contain the "bisher X Folgen" pattern.
        # If so, we need to group episodes by code prefix instead.
        bisher_pattern = re.compile(r'bisher.*\d+.*Folgen', re.IGNORECASE)
        bisher_count = sum(1 for name, _ in season_links if bisher_pattern.search(name))
        mostly_bisher = bool(season_links) and bisher_count > len(season_links) // 2

        if mostly_bisher:
            # Special case: all seasons have the same name (like "bisher 1369 Folgen").
            # Extract and group episodes from the overview page by episode code prefix.
            print(f"All seasons are named '{season_links[0][0]}' - grouping by episode code")
            seasons = self._extract_grouped_from_overview(soup)
            print(f"Found {len(seasons)} grouped seasons with {sum(len(s.episodes) for s in seasons)} episodes in total")
            return series_title, seasons

        # Scrape each season from its dedicated page.
        # First, check if any season names have duplicates (same base number).
        season_names = [name for name, _ in season_links]
        has_duplicate_seasons = False
        for name in season_names:
            # Extract the base season number (e.g., "Staffel 6" from "Staffel 6: Video-Podcast")
            base_match = re.search(r'Staffel\s+(\d+)', name)
            if base_match:
                base_num = base_match.group(1)
                # Count how many seasons have this base number
                count = sum(1 for n in season_names if f'Staffel {base_num}' in n)
                if count > 1:
                    has_duplicate_seasons = True
                    break

        # Don't fall back to the overview page if there are duplicate season numbers
        # (e.g., "Staffel 6" and "Staffel 6: Video-Podcast"), because the overview
        # page can't distinguish between variants with the same base number.
        skip_overview = has_duplicate_seasons

        seasons = []
        for i, (season_name, season_url) in enumerate(season_links):
            print(f"  Loading {season_name}...")
            season = self._scrape_season_page(season_name, season_url, i)

            # If the season has no episodes, or its episodes have no episode_number,
            # try to extract from the overview page instead.
            if season and season.episodes:
                # Check if any episode has an episode_number
                has_episode_numbers = any(ep.episode_number is not None for ep in season.episodes)

                # Also check whether the episode numbers look wrong (e.g., starting at 1
                # for each season). This happens when scraping from
                # <section itemprop="episode">, which only has season-relative numbers,
                # not overall series numbers.
                needs_overview = False
                if has_episode_numbers and not skip_overview:
                    # If the episodes are numbered 1, 2, 3... but this isn't the first
                    # season, the numbers are probably season-relative, not series-wide.
                    first_ep_num = next((ep.episode_number for ep in season.episodes if ep.episode_number), None)
                    if first_ep_num == 1 and i > 0:  # i > 0 means not the first season
                        needs_overview = True

                if (not has_episode_numbers or needs_overview) and not skip_overview:
                    # Try extracting from the overview page (only when safe)
                    overview_season = self._extract_season_from_overview(soup, season_name, i)
                    if overview_season and overview_season.episodes:
                        season = overview_season
                        print(f"  {len(season.episodes)} episodes (from overview page)")
                    else:
                        print(f"  {len(season.episodes)} episodes")
                else:
                    print(f"  {len(season.episodes)} episodes")
                seasons.append(season)
            elif season:
                # Empty season, try the overview page (only when safe)
                if not skip_overview:
                    overview_season = self._extract_season_from_overview(soup, season_name, i)
                    if overview_season and overview_season.episodes:
                        seasons.append(overview_season)
                        print(f"  {len(overview_season.episodes)} episodes (from overview page)")

        # Also check for seasons shown directly on the overview page (e.g., Specials)
        overview_seasons = self._extract_seasons_from_overview(soup, len(seasons))
        if overview_seasons:
            # Track existing season names to avoid duplicates
            existing_names = {s.name for s in seasons}
            new_seasons = [s for s in overview_seasons if s.name not in existing_names]

            if new_seasons:
                print(f"Found {len(new_seasons)} additional seasons on the overview page")
                for season in new_seasons:
                    seasons.append(season)
                    print(f"  {season.name}: {len(season.episodes)} episodes")

        return series_title, seasons

    def _extract_series_title(self, soup: BeautifulSoup) -> str:
        """Extract the series title from the page"""
        # Try meta tags first
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            title = og_title['content']
            # Remove the ": Episodenguide" suffix
            title = re.sub(r':\s*Episodenguide.*$', '', title, flags=re.IGNORECASE)
            return title.strip()

        # Fall back to h1
        h1 = soup.find('h1')
        if h1:
            return h1.get_text(strip=True)

        return "Unbekannte Serie"

    def _extract_season_links(self, soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
        """
        Extract season links from the series menu.

        Returns:
            List of (season_name, season_url) tuples
        """
        season_links = []
        seen_urls = set()

        # Collect links from multiple sources
        links = []

        # 1. Try to find the series menu navigation (newer layout)
        series_menu = soup.find('nav', class_='series-menu')
        if series_menu:
            # Find the episodenguide submenu
            episode_menu = series_menu.find('li', {'data-menu-item': 'episodenguide'})
            if episode_menu:
                # Same pattern as the global search - no trailing slash required
                links.extend(episode_menu.find_all('a', href=re.compile(r'episodenguide/(staffel-[^/]+|\d+)')))

        # 2. Search globally for season links (works for pages without a series menu).
        # Pattern matches: /episodenguide/staffel-1/, /episodenguide/staffel-1/18522, /episodenguide/0/, etc.
        # Note: no trailing slash required - URLs can end with /staffel-1 or /staffel-1/12345.
        global_links = soup.find_all('a', href=re.compile(r'episodenguide/(staffel-[^/]+|\d+)'))
        links.extend(global_links)

        for link in links:
            # Extract the season name robustly: first try text from strong/b tags only
            # (ignoring image alt text)
            strong_tag = link.find(['strong', 'b'])
            if strong_tag:
                season_name = strong_tag.get_text(strip=True)
            else:
                # Fallback: get direct text children only (exclude nested elements like img)
                season_name = ''.join(link.find_all(string=True, recursive=False)).strip()
                # If still empty, use the full text
                if not season_name:
                    season_name = link.get_text(strip=True)

            # Clean up image captions that might leak through
            season_name = re.sub(r'Bild:\s*[^A-Z]*(?=[A-Z])', '', season_name)
            season_name = re.sub(r'Foto:\s*[^A-Z]*(?=[A-Z])', '', season_name)

            # Normalize whitespace (e.g., collapse "Staffel  6" to "Staffel 6")
            season_name = ' '.join(season_name.split())

            season_url = link.get('href', '')

            # Skip navigation/anchor links
            if not season_url or season_url.startswith('#'):
                continue

            # If the season name is just "Staffel" without a number, try the URL
            if season_name.lower() in ['staffel', 'season']:
                # Try to extract the season number from a URL like "staffel-6/47453"
                url_match = re.search(r'/staffel-(\d+)', season_url)
                if url_match:
                    season_num = url_match.group(1)
                    season_name = f"Staffel {season_num}"
                    logger.debug(f"Added season number from URL: '{season_name}'")

            logger.debug(f"Extracted season name: '{season_name}' from link {season_url}")

            # Make the URL absolute
            if season_url.startswith('/'):
                season_url = self.BASE_URL + season_url
            elif not season_url.startswith('http'):
                # Relative URL like "episodenguide/0/28673" - combine with the base URL path
                season_url = urljoin(base_url, season_url)

            # Skip duplicates (extract the staffel identifier for a robust comparison).
            # This ignores different series slugs (e.g., nachtstreife-2020 vs nachtstreife-2-0).
            staffel_match = re.search(r'/(staffel-[^/]+/\d+)', season_url)
            if staffel_match:
                staffel_identifier = staffel_match.group(1).lower()
            else:
                # Fall back to full URL normalization for non-standard URLs
                staffel_identifier = season_url.lower().rstrip('/')

            logger.debug(f"Season identifier: '{staffel_identifier}' from {season_url}")

            if staffel_identifier in seen_urls:
                logger.debug(f"Skipping duplicate season URL: {season_url}")
                continue

            # Skip "Übersicht" links
            if season_name.lower() in ['übersicht', 'episoden']:
                continue

            # Clean up the season name: extract the year if it's embedded in text,
            # e.g., "Bild: NDR2026" -> "2026". Look for a 4-digit year anywhere in
            # the string (without word boundaries).
            year_match = re.search(r'(20\d{2}|19\d{2})', season_name)
            if year_match:
                year = year_match.group(1)
                # Check if the name starts with an image caption (e.g., "Bild: NDR2026")
                if re.match(r'^(Bild:|Foto:)', season_name, re.IGNORECASE):
                    season_name = year

            # Handle duplicate season names by appending "Teil 2", "Teil 3", etc.
            # This happens when multiple seasons share a name but have different URLs
            # (e.g., "2020" regular episodes vs. "2020" specials).
            existing_names = [name for name, _ in season_links]
            if season_name in existing_names:
                # Count the base name and all existing "Teil X" variants
                base_name = season_name
                count = 1
                for name in existing_names:
                    if name == base_name or name.startswith(f"{base_name} Teil "):
                        count += 1
                original_name = season_name
                season_name = f"{season_name} Teil {count}"
                logger.debug(f"Duplicate season name detected: '{original_name}' -> '{season_name}' (URL: {season_url})")

            seen_urls.add(staffel_identifier)
            season_links.append((season_name, season_url))
            logger.debug(f"Added season: '{season_name}' -> {season_url}")

        return season_links
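
    # Illustrative return value (hypothetical series slug and page IDs):
    #   [("Staffel 1", "https://www.fernsehserien.de/<serie>/episodenguide/staffel-1/12345"),
    #    ("Specials",  "https://www.fernsehserien.de/<serie>/episodenguide/0/67890")]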

    def _extract_seasons_from_overview(self, soup: BeautifulSoup, start_sort_order: int) -> List[ScrapedSeason]:
        """
        Extract seasons that are shown directly on the overview page (e.g., Specials).

        Args:
            soup: BeautifulSoup of the overview page
            start_sort_order: Starting sort order number

        Returns:
            List of ScrapedSeason objects
        """
        seasons = []
        sort_order = start_sort_order

        # Find sections with season headers (but no corresponding menu link).
        # These are typically Specials or other special categories.
        sections = soup.find_all('section')

        for section in sections:
            # Look for headers like "Specials", "Extras", etc.
            header = section.find(['h2', 'h3'], id=re.compile(r'Special|Extra'))

            if not header:
                continue

            season_name = header.get_text(strip=True)

            # Skip if this is just a navigation element
            if 'karussell' in season_name.lower():
                continue

            # Extract episodes from this section
            episodes = self._extract_episodes_from_page(section)

            if episodes:
                season_type = self.classify_season_type(season_name)

                season = ScrapedSeason(
                    name=season_name,
                    season_type=season_type,
                    sort_order=sort_order,
                    episodes=episodes
                )
                seasons.append(season)
                sort_order += 1

        return seasons

    def _extract_grouped_from_overview(self, soup: BeautifulSoup) -> List[ScrapedSeason]:
        """
        Extract all episodes from the overview page and group them by episode code prefix.
        Used for series like "Wer weiß denn sowas?" where all seasons have the same name.

        Args:
            soup: BeautifulSoup of the overview page

        Returns:
            List of ScrapedSeason objects grouped by episode code prefix
        """
        # First, try to find season links to get proper season numbers.
        # Pattern: /episodenguide/11/30583 -> season 11
        season_links = soup.find_all('a', href=re.compile(r'episodenguide/(\d+)/'))
        season_numbers = set()
        for link in season_links:
            href = link.get('href', '')
            match = re.search(r'episodenguide/(\d+)/', href)
            if match:
                season_numbers.add(match.group(1))

        # Find all episode rows
        all_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'})

        # Group episodes by their code prefix (e.g., "1.01" -> "1", "2.001" -> "2")
        groups = {}

        for i, row in enumerate(all_rows, 1):
            episode = self._parse_episode_row(row, i)
            if not episode:
                continue

            # Extract the group from the row's title attribute (e.g., "1.01 Title" -> "1").
            # This is more reliable than episode_code, which might be just "01".
            row_title = row.get('title', '')
            title_code_match = re.match(r'^(\d+)\.', row_title)

            if title_code_match:
                group_key = title_code_match.group(1)
            else:
                # Fallback: try the episode code
                code_match = re.match(r'^(\d+)[x.]', episode.episode_code)
                if not code_match:
                    code_match = re.match(r'^(\d+)', episode.episode_code)

                if code_match:
                    group_key = code_match.group(1)
                else:
                    group_key = episode.episode_code

            # Check the title for special season indicators (XXL, Quizmarathon, etc.)
            title_lower = episode.title.lower()
            if 'xxl' in title_lower:
                group_key = 'XXL'
            elif 'quizmarathon' in title_lower:
                group_key = 'Quizmarathon'

            groups.setdefault(group_key, []).append(episode)

        # Convert the groups to ScrapedSeason objects
        seasons = []

        # Sort by numeric key where possible
        def sort_key(item):
            group_key, _ = item
            if group_key.isdigit():
                return (0, int(group_key))  # Numeric groups first
            return (1, group_key)  # Non-numeric groups (XXL, etc.) after

        for sort_order, (group_key, episode_list) in enumerate(sorted(groups.items(), key=sort_key)):
            # Fix duplicate episode codes within this group
            episode_list = self._fix_duplicate_episode_codes(episode_list)

            # Determine the season name and type
            if group_key == 'XXL':
                season_name = "Wer weiß denn sowas XXL"
                season_type = SeasonType.EXTRAS
            elif group_key == 'Quizmarathon':
                season_name = "Quizmarathon"
                season_type = SeasonType.SPECIALS
            elif group_key == '0':
                season_name = "Specials"
                season_type = SeasonType.SPECIALS
            elif group_key.isdigit() and group_key in season_numbers:
                # This is a proper season number from the URLs
                season_name = f"Staffel {int(group_key)}"
                season_type = SeasonType.NORMAL
            else:
                # Fallback: use the year or the group number
                first_ep_date = next((ep.date_de_tv for ep in episode_list if ep.date_de_tv), None)
                if first_ep_date:
                    season_name = str(first_ep_date.year)
                    season_type = SeasonType.YEAR_BASED
                else:
                    season_name = f"Gruppe {group_key}"
                    season_type = SeasonType.NORMAL

            season = ScrapedSeason(
                name=season_name,
                season_type=season_type,
                sort_order=sort_order,
                episodes=episode_list
            )
            seasons.append(season)

        return seasons

    def _extract_season_from_overview(self, soup: BeautifulSoup, season_name: str, sort_order: int) -> Optional[ScrapedSeason]:
        """
        Extract episodes for a specific season from the overview page.
        Used when individual season pages don't have episode_number data.

        Args:
            soup: BeautifulSoup of the overview page
            season_name: Name of the season to extract (e.g., "Staffel 1")
            sort_order: Sort order number

        Returns:
            ScrapedSeason or None
        """
        # Find all episode rows on the overview page
        all_episode_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'})

        if not all_episode_rows:
            return None

        # Extract episodes and filter them by season
        episodes = []

        for i, row in enumerate(all_episode_rows, 1):
            # Parse the episode
            episode = self._parse_episode_row(row, i)

            if not episode:
                continue

            # Check whether the episode belongs to this season by looking at the URL
            href = row.get('href', '')

            # Skip XXL or special versions when looking for a plain season
            # (these are handled by their own dedicated pages)
            if 'xxl' in href.lower() and ':' not in season_name.lower():
                continue

            # Extract the season number from an href like "/comedystreet/folgen/1x01-..."
            season_match = re.search(r'/folgen/(\d+)x\d+', href)

            if not season_match:
                continue

            # Convert a season_name like "Staffel 1" to a number.
            # Don't match "Staffel 1: Something" - only plain "Staffel 1".
            season_num_match = re.search(r'^Staffel\s+(\d+)$', season_name.strip())
            if not season_num_match:
                # Try without the "Staffel" prefix - might be just "1", "2", etc.
                season_num_match = re.search(r'^(\d+)$', season_name.strip())

            if not season_num_match:
                # This is a special season like "Staffel 1: XXL"; skip it.
                # Those are handled by their own dedicated pages.
                continue

            expected_season_num = int(season_num_match.group(1))
            actual_season_num = int(season_match.group(1))

            if expected_season_num != actual_season_num:
                continue

            # This episode belongs to the requested season
            episodes.append(episode)

        if not episodes:
            return None

        # Fix duplicate episode codes
        episodes = self._fix_duplicate_episode_codes(episodes)

        season_type = self.classify_season_type(season_name)

        logger.debug(f"Scraped season '{season_name}': {len(episodes)} episodes found")

        return ScrapedSeason(
            name=season_name,
            season_type=season_type,
            sort_order=sort_order,
            episodes=episodes
        )

    def _scrape_season_page(self, season_name: str, season_url: str, sort_order: int) -> Optional[ScrapedSeason]:
        """
        Scrape a single season page.

        Args:
            season_name: Name of the season
            season_url: URL of the season page
            sort_order: Sort order number

        Returns:
            ScrapedSeason or None
        """
        try:
            response = self.session.get(season_url, timeout=15)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"  Error loading page: {e}")
            return None

        soup = BeautifulSoup(response.content, 'lxml')

        # Extract episodes from this page
        episodes = self._extract_episodes_from_page(soup)

        if not episodes:
            return None

        season_type = self.classify_season_type(season_name)

        logger.debug(f"Scraped season '{season_name}': {len(episodes)} episodes found")

        return ScrapedSeason(
            name=season_name,
            season_type=season_type,
            sort_order=sort_order,
            episodes=episodes
        )

    def _fix_duplicate_episode_codes(self, episodes: List[ScrapedEpisode]) -> List[ScrapedEpisode]:
        """
        Fix duplicate episode codes by adding letter suffixes (a, b, c, ...) to ALL
        episodes that share the same code (including the first occurrence).

        For example, if three episodes have code "01", they become "01a", "01b", "01c".

        Args:
            episodes: List of episodes that may contain duplicates

        Returns:
            List of episodes with unique codes
        """
        # Count how many times each code appears
        code_counts = Counter(ep.episode_code for ep in episodes)

        # Track which suffix to use next for each code
        code_suffixes = {}

        # Process each episode
        for episode in episodes:
            original_code = episode.episode_code

            # If this code appears more than once, add a suffix to ALL occurrences
            if code_counts[original_code] > 1:
                # Get the next suffix for this code (a, b, c, ...)
                if original_code not in code_suffixes:
                    code_suffixes[original_code] = ord('a')

                suffix_char = chr(code_suffixes[original_code])
                episode.episode_code = f"{original_code}{suffix_char}"
                code_suffixes[original_code] += 1

        return episodes
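
    # Illustrative example (worked by hand): input codes ["01", "01", "02", "01"]
    # become ["01a", "01b", "02", "01c"] - every duplicate occurrence gets a suffix,
    # while the unique code "02" is left untouched.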

    def _extract_episodes_from_page(self, soup: BeautifulSoup) -> List[ScrapedEpisode]:
        """
        Extract episodes from a season page.

        fernsehserien.de uses H3 headers for episode titles within section elements,
        OR section elements with itemprop="episode" (alternative layout).
        """
        episodes = []

        # Method 1: find all H3 elements that start with a number or "Folge X".
        # Accept both the "41. Title" and the "Folge 42" format.
        all_h3 = soup.find_all('h3')
        episode_h3s = [h3 for h3 in all_h3 if re.match(r'^(\d+[a-z]?\.|Folge\s+\d+)', h3.get_text(strip=True))]

        if episode_h3s:
            # Method 1: H3-based extraction (main overview page with detailed episode info)
            for h3 in episode_h3s:
                episode = self._parse_episode_h3(h3)
                if episode:
                    episodes.append(episode)
        else:
            # Method 2: try the overview page table format with
            # <a role="row" itemprop="episode">. This format has the episode_number
            # in the cells.
            episode_rows = soup.find_all('a', {'role': 'row', 'itemprop': 'episode'})
            if episode_rows:
                for i, row in enumerate(episode_rows, 1):
                    episode = self._parse_episode_row(row, i)
                    if episode:
                        episodes.append(episode)
            else:
                # Method 3: try the alternative layout with section[itemprop="episode"],
                # used on some dedicated season pages (e.g., ComedyStreet Staffel 6+)
                episode_sections = soup.find_all('section', itemprop='episode')
                for i, section in enumerate(episode_sections, 1):
                    episode = self._parse_episode_section(section, i)
                    if episode:
                        episodes.append(episode)

        # After collecting all episodes, handle duplicate episode codes.
        # This ensures ALL episodes with the same code get suffixes (a, b, c, ...).
        episodes = self._fix_duplicate_episode_codes(episodes)

        return episodes

    def _parse_episode_h3(self, h3) -> Optional[ScrapedEpisode]:
        """
        Parse an episode from an H3 header.

        Format: "1. Episode Title (Original Title)"
        The parent section contains date information as plain text.
        """
        title_text = h3.get_text(strip=True)

        # Extract episode_number (overall series number) from the H3 text.
        # Example: "111. Episode Title" -> episode_number = 111
        # Example: "Folge 42" -> episode_number = 42
        episode_number = None
        number_match = re.match(r'^(\d+)[a-z]?\.', title_text)
        if not number_match:
            # Try the "Folge X" format
            number_match = re.match(r'^Folge\s+(\d+)', title_text)
        if number_match:
            try:
                episode_number = int(number_match.group(1))
            except ValueError:
                pass

        # Extract the title (everything after the number or "Folge X")
        title_match = re.match(r'^\d+[a-z]?\.?\s*(.+)', title_text)
        if not title_match:
            # Try the "Folge X Title" format
            title_match = re.match(r'^Folge\s+\d+\s*(.+)?', title_text)
        if title_match:
            title = title_match.group(1).strip() if title_match.group(1) else title_text
        else:
            title = title_text

        # Remove an English title in parentheses if present.
        # Example: "Gewöhnliche Leute(Nosedive)" -> "Gewöhnliche Leute"
        # Example: "White Christmas" -> "White Christmas" (unchanged without a German title)
        title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip()

        # Get the parent section that contains the date information
        section = h3.find_parent('section')
        if not section:
            # Fallback: return the episode without dates
            return ScrapedEpisode(
                episode_code="00",
                title=title,
                episode_number=episode_number
            )

        # Extract episode_code (season-specific episode number) and episode_id
        # from the episode link.
        # Format: /folgen/12x01-title-1828679 or /folgen/01-title-1828679 (for specials)
        episode_code = None
        episode_id = None
        episode_link = section.find('a', href=re.compile(r'/folgen/'))
        if episode_link:
            href = episode_link.get('href', '')
            # Extract episode_id (last number in the URL)
            episode_id_match = re.search(r'-(\d+)$', href)
            if episode_id_match:
                episode_id = episode_id_match.group(1)

            # Try the format /folgen/12x01-... (regular episodes)
            season_episode_match = re.search(r'/folgen/(\d+)x(\d+)', href)
            if season_episode_match:
                episode_code = season_episode_match.group(2).zfill(2)
            else:
                # Try the format /folgen/01-... (specials without a season prefix)
                special_match = re.search(r'/folgen/(\d+)-', href)
                if special_match:
                    episode_code = special_match.group(1).zfill(2)

        # Fallback: extract from the H3 text if link extraction failed.
        # Examples: "0.01 Title" -> "01", "1. Title" -> "01", "12a. Title" -> "12a"
        if not episode_code:
            # Try to find a pattern like "X.YY" in the title text (e.g., "0.01")
            decimal_match = re.match(r'^\d+\.(\d+[a-z]?)', title_text)
            if decimal_match:
                ep_num = decimal_match.group(1)
                if ep_num.isdigit():
                    episode_code = ep_num.zfill(2)
                else:
                    # Handle cases like "12a"
                    episode_code = ep_num[:-1].zfill(2) + ep_num[-1] if len(ep_num) >= 2 else ep_num.zfill(2)
            else:
                # Last resort: use the episode number from the start of the H3
                episode_code = self.extract_episode_code(title_text)

        # Extract dates from the section text
        section_text = section.get_text()
        dates = self._extract_dates_from_text(section_text)

        return ScrapedEpisode(
            episode_code=episode_code,
            title=title,
            episode_number=episode_number,
            episode_id=episode_id,
            date_de_tv=dates.get('de_tv'),
            date_de_streaming=dates.get('de_streaming'),
            date_de_home_media=dates.get('de_home_media'),
            date_de_sync=dates.get('de_sync'),
            date_original=dates.get('original')
        )

    def _parse_episode_section(self, section, fallback_number: int) -> Optional[ScrapedEpisode]:
        """
        Parse an episode from a section element with itemprop="episode".
        Used on dedicated season pages with the alternative layout (e.g., ComedyStreet Staffel 1-5).

        Args:
            section: BeautifulSoup section element
            fallback_number: Episode number to use if it can't be extracted from the URL
        """
        # Extract the title from itemprop="name"
        title_elem = section.find(itemprop='name')
        if not title_elem:
            return None

        title = title_elem.get_text(strip=True)

        # Remove an English title in parentheses if present
        title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip()

        # Try to extract episode info from the URL: href="/series/folgen/1x01-title-1828679"
        url_elem = section.find('a', itemprop='url')
        episode_code = None
        episode_number = None
        episode_id = None

        if url_elem:
            href = url_elem.get('href', '')
            # Extract episode_id (last number in the URL)
            episode_id_match = re.search(r'-(\d+)$', href)
            if episode_id_match:
                episode_id = episode_id_match.group(1)

            # Pattern: /folgen/1x01-... or /folgen/SxE-...
            match = re.search(r'/folgen/(\d+)x(\d+)', href)
            if match:
                episode_code = match.group(2).zfill(2)

        # Try to extract episode_number from <div role="cell" itemprop="episodeNumber">.
        # Structure on the overview page: <div content="1" itemprop="episodeNumber" role="cell">01</div>
        # The content attribute contains the overall episode number.
        cell_div = section.find('div', itemprop='episodeNumber')
        if cell_div:
            # Extract from the content attribute
            content_attr = cell_div.get('content')
            if content_attr:
                try:
                    episode_number = int(content_attr)
                except (ValueError, TypeError):
                    pass

        # Fallback: use sequential numbering
        if not episode_code:
            episode_code = str(fallback_number).zfill(2)

        # Extract dates
        section_text = section.get_text()
        dates = self._extract_dates_from_text(section_text)

        return ScrapedEpisode(
            episode_code=episode_code,
            title=title,
            episode_number=episode_number,
            episode_id=episode_id,
            date_de_tv=dates.get('de_tv'),
            date_de_streaming=dates.get('de_streaming'),
            date_de_home_media=dates.get('de_home_media'),
            date_de_sync=dates.get('de_sync'),
            date_original=dates.get('original')
        )

    def _parse_episode_row(self, row, fallback_number: int) -> Optional[ScrapedEpisode]:
        """
        Parse an episode from a row element (<a role="row" itemprop="episode">).
        Used on overview pages with the table format.

        Args:
            row: BeautifulSoup <a> element with role="row"
            fallback_number: Episode number to use if it can't be extracted
        """
        # Extract episode_id from the row href (e.g., /series/folgen/1x01-title-1828679)
        episode_id = None
        href = row.get('href', '')
        if href:
            episode_id_match = re.search(r'-(\d+)$', href)
            if episode_id_match:
                episode_id = episode_id_match.group(1)

        # Get all cells in the row
        cells = row.find_all('div', role='cell')

        if len(cells) < 7:
            return None

        # Cell structure (observed in test output):
        #   Cell 2: overall episode number (before the span) + season.episode (in the span)
        #   Cell 5: episode_code with itemprop="episodeNumber"
        #   Cell 7: title with itemprop="name"
        #   Cell 8: date

        # Extract the overall episode_number from cell 2
        episode_number = None
        if len(cells) >= 2:
            cell2 = cells[1]  # Index 1 = cell 2
            # Structure: <div role="cell">1<span class="episodenliste-schmal"><b>1.01</b></span></div>
            # Extract the number before the span
            cell2_text = ''
            for child in cell2.children:
                if isinstance(child, str):
                    cell2_text += child.strip()
                else:
                    # Stop at the first tag (the span)
                    break

            if cell2_text:
                try:
                    episode_number = int(cell2_text)
                except ValueError:
                    pass

        # Extract episode_code from cell 5 (itemprop="episodeNumber")
        episode_code = None
        ep_code_cell = row.find('div', itemprop='episodeNumber')
        if ep_code_cell:
            code_text = ep_code_cell.get_text(strip=True)
            if code_text:
                episode_code = code_text.zfill(2)

        # Fallback for episode_code
        if not episode_code:
            episode_code = str(fallback_number).zfill(2)

        # Extract the title from cell 7
        title = ""
        title_cell = row.find('div', class_='episodenliste-2019-episodentitel')
        if title_cell:
            title_elem = title_cell.find(itemprop='name')
            if title_elem:
                title = title_elem.get_text(strip=True)
                # Remove an English title in parentheses if present
                title = re.sub(r'\s*\([^)]+\)\s*$', '', title).strip()

        # Extract the date from cell 8 (plain date text)
        date_de_tv = None
        if len(cells) >= 8:
            date_cell = cells[7]  # Index 7 = cell 8
            date_text = date_cell.get_text(strip=True)
            if date_text:
                date_de_tv = self.parse_german_date(date_text)

        return ScrapedEpisode(
            episode_code=episode_code,
            title=title,
            episode_number=episode_number,
            episode_id=episode_id,
            date_de_tv=date_de_tv,
            date_de_streaming=None,
            date_de_home_media=None,
            date_de_sync=None,
            date_original=None
        )

    def _extract_dates_from_text(self, text: str) -> Dict[str, Optional[datetime]]:
        """
        Extract dates from plain text containing German date labels.

        Expected formats:
        - "Deutsche TV-Premiere Mi. 11.12.2013 RTL Crime"
        - "Deutsche Streaming-Premiere Fr. 21.10.2016 Netflix"
        - "Deutsche Home-Media-Premiere Do. 21.11.2024"
        - "Original-TV-Premiere So. 04.12.2011 Channel 4"
        - "Premiere der deutschen Synchronfassung ..."
        """
        dates = {
            'de_tv': None,
            'de_streaming': None,
            'de_home_media': None,
            'de_sync': None,
            'original': None
        }

        # Search for "Deutsche TV-Premiere" followed by a date
        match = re.search(r'Deutsche\s+TV-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
        if match:
            dates['de_tv'] = self.parse_german_date(match.group(1))

        # Search for "Deutsche Streaming-Premiere"
        match = re.search(r'Deutsche\s+Streaming-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
        if match:
            dates['de_streaming'] = self.parse_german_date(match.group(1))

        # Search for "Deutsche Home-Media-Premiere"
        match = re.search(r'Deutsche\s+Home-Media-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
        if match:
            dates['de_home_media'] = self.parse_german_date(match.group(1))

        # Search for "Premiere der deutschen Synchronfassung"
        match = re.search(r'Premiere\s+der\s+deutschen\s+Synchronfassung\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
        if match:
            dates['de_sync'] = self.parse_german_date(match.group(1))

        # Search for "Original-TV-Premiere" or "Original-Streaming-Premiere"
        match = re.search(r'Original-(?:TV|Streaming)-Premiere\s+\w+\.\s+(\d{1,2}\.\d{1,2}\.\d{2,4})', text)
        if match:
            dates['original'] = self.parse_german_date(match.group(1))

        return dates
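
    # Illustrative example (using the first format listed in the docstring):
    #   _extract_dates_from_text("Deutsche TV-Premiere Mi. 11.12.2013 RTL Crime")
    #   -> {'de_tv': datetime(2013, 12, 11), 'de_streaming': None,
    #       'de_home_media': None, 'de_sync': None, 'original': None}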


def main():
    """Test the scraper"""
    if len(sys.argv) < 2:
        print("Usage: python fernsehserien_scraper.py <url>")
        print("Example: python fernsehserien_scraper.py https://www.fernsehserien.de/black-mirror/episodenguide")
        sys.exit(1)

    url = sys.argv[1]

    scraper = FernsehserienScraper()
    title, seasons = scraper.scrape_series(url)

    print(f"\n=== {title} ===")
    print(f"Total seasons: {len(seasons)}\n")

    for season in seasons:
        print(f"{season.name} ({season.season_type.value}): {len(season.episodes)} episodes")
        for ep in season.episodes[:3]:  # Show the first 3
            dates = []
            if ep.date_original:
                dates.append(f"Orig: {ep.date_original.strftime('%d.%m.%Y')}")
            if ep.date_de_tv:
                dates.append(f"DE: {ep.date_de_tv.strftime('%d.%m.%Y')}")

            date_str = ", ".join(dates) if dates else "No dates"
            print(f"  {ep.episode_code}. {ep.title} ({date_str})")

        if len(season.episodes) > 3:
            print(f"  ... and {len(season.episodes) - 3} more")
        print()


if __name__ == "__main__":
    main()