itch-dl/itch_dl/infobox.py

from datetime import datetime
from typing import TypedDict, Dict, List, Any, Tuple, Optional

from bs4 import BeautifulSoup


class InfoboxMetadata(TypedDict, total=False):
    # Every key is optional (total=False): a key is only present when the
    # matching row was found in the infobox table. Fields commented "Links"
    # map link text -> the URL that link points at.

    # Dates, parsed by parse_date_block.
    updated_at: datetime
    released_at: datetime
    published_at: datetime

    status: str
    platforms: List[str]  # Windows/macOS/Linux/etc
    publisher: str
    author: Dict[str, str]  # {"author": name, "author_url": url}, see parse_tr
    authors: Dict[str, str]  # Links
    genre: Dict[str, str]  # Links
    tools: Dict[str, str]  # Links ("Made with" row)
    license: Dict[str, str]  # Links
    asset_license: Dict[str, str]  # Links
    tags: Dict[str, str]  # Links
    length: str  # "Average session" row
    languages: Dict[str, str]  # Links (emitted by parse_tr, was undeclared)
    multiplayer: Dict[str, str]  # Links
    player_count: str
    accessibility: Dict[str, str]  # Links
    inputs: Dict[str, str]  # Links
    links: Dict[str, str]  # Links
    mentions: Dict[str, str]  # Links


def parse_date_block(td: BeautifulSoup) -> Optional[datetime]:
    """Read the timestamp stored in the title of the cell's <abbr> element.

    The title is expected to look like "15 May 2022 @ 18:51", i.e. a
    "%d %B %Y" date and a "%H:%M" time separated by a single "@".

    Returns None when the cell has no <abbr>, or the <abbr> has no title.
    Raises ValueError when the title does not match the expected layout.
    """
    abbr = td.find("abbr")
    has_title = bool(abbr) and "title" in abbr.attrs
    if not has_title:
        return None

    # Exactly two "@"-separated parts are required; anything else fails
    # the unpacking with a ValueError.
    raw_date, raw_time = (part.strip() for part in abbr["title"].split("@"))
    day = datetime.strptime(raw_date, "%d %B %Y").date()
    clock = datetime.strptime(raw_time, "%H:%M").time()
    return datetime.combine(day, clock)


def parse_links(td: BeautifulSoup) -> Dict[str, str]:
    """Parses blocks of comma-separated <a> blocks, returns a dict
    of link text -> URL it points at.

    Link text is stripped of surrounding whitespace. If several links share
    the same text, the URL of the last one wins. <a> elements without an
    href attribute point nowhere, so they are skipped instead of raising
    KeyError."""
    links: Dict[str, str] = {}
    for link in td.find_all("a"):
        href = link.get("href")
        if href is None:
            continue
        links[link.text.strip()] = href
    return links


def parse_text_from_links(td: BeautifulSoup) -> List[str]:
    """Returns only the link texts found by parse_links, in the order
    they appear in the resulting dict; the URLs are dropped."""
    text_to_url = parse_links(td)
    return [text for text in text_to_url]


# Infobox row name -> metadata key, grouped by how the cell is parsed.

# Rows holding a date inside an <abbr title="..."> element.
_DATE_BLOCKS = {
    "Updated": "updated_at",
    "Release date": "released_at",
    "Published": "published_at",
}

# Rows whose value is the text of the first link in the cell.
_FIRST_LINK_TEXT_BLOCKS = {
    "Status": "status",
    "Average session": "length",
}

# Rows stored as the plain text of the cell.
_TEXT_BLOCKS = {
    "Publisher": "publisher",
    "Player count": "player_count",
}

# Rows stored as a dict of link text -> URL.
_LINK_BLOCKS = {
    "Authors": "authors",
    "Genre": "genre",
    "Made with": "tools",
    "License": "license",
    "Asset license": "asset_license",
    "Tags": "tags",
    "Languages": "languages",
    "Multiplayer": "multiplayer",
    "Accessibility": "accessibility",
    "Inputs": "inputs",
    "Links": "links",
    "Mentions": "mentions",
}


def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
    """Parses one infobox row into a (metadata key, value) pair.

    name is the stripped text of the row's first cell, content is the
    row's second cell.

    Returns None for rows that yield no metadata: the "Rating" row, and
    an "Author" row that contains no link. The value of a date row may be
    None when parse_date_block finds no usable <abbr> element.

    Raises NotImplementedError for a row name this function doesn't know."""
    if name == "Rating":
        return None  # Read the AggregatedRating block instead!

    if name in _DATE_BLOCKS:
        return _DATE_BLOCKS[name], parse_date_block(content)

    if name in _FIRST_LINK_TEXT_BLOCKS:
        # Prefer the first link's text; if the cell has no link at all,
        # fall back to the cell's own text instead of raising IndexError.
        first_link = content.find("a")
        source = content if first_link is None else first_link
        return _FIRST_LINK_TEXT_BLOCKS[name], source.text.strip()

    if name == "Platforms":
        return "platforms", parse_text_from_links(content)

    if name in _TEXT_BLOCKS:
        return _TEXT_BLOCKS[name], content.text.strip()

    if name == "Author":
        links = parse_links(content)
        if not links:
            # Nothing to report without an author link; popitem() on an
            # empty dict would raise KeyError.
            return None

        author, author_url = links.popitem()
        return "author", {"author": author, "author_url": author_url}

    if name in _LINK_BLOCKS:
        return _LINK_BLOCKS[name], parse_links(content)

    # Oops, you need to extend this with something new. Sorry.
    # Make sure to add the block name to InfoboxMetadata as well!
    raise NotImplementedError(f"Unknown infobox block name '{name}' - please file a new itch-dl issue.")


def parse_infobox(infobox: BeautifulSoup) -> dict:
    """Feed it <div class="game_info_panel_widget">, out goes a dict
    of parsed metadata blocks.

    Each <tr> with at least two <td> cells is handed to parse_tr: the
    stripped text of the first cell is the block name, the second cell is
    the block content. Rows with fewer cells are ignored.

    A row only ends up in the result when parse_tr produced an actual
    value. Rows it declines (returns None) and rows whose value is None
    (e.g. a date row without a usable <abbr title="...">) are left out,
    so a present key never holds None.

    Exceptions raised by parse_tr are not caught here."""
    meta = InfoboxMetadata()

    for tr in infobox.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) < 2:
            continue

        name_td, content_td = tds[0], tds[1]
        name = name_td.text.strip()

        parsed_block = parse_tr(name, content_td)
        if parsed_block is None:
            continue

        key, value = parsed_block
        if value is None:
            # All InfoboxMetadata keys are optional; omit the key rather
            # than store None under a non-Optional field.
            continue

        # noinspection PyTypedDict
        meta[key] = value

    return meta
Implement screenshot/cover art downloads, initial infobox parsing 2022-05-15 18:51:13 +02:00			`from datetime import datetime`
			`from typing import TypedDict, Dict, List, Any, Tuple, Optional`

			`from bs4 import BeautifulSoup`


			`class InfoboxMetadata(TypedDict, total=False):`
Implement infobox parsing, misc bugfixes, version bump 2022-05-15 20:10:32 +02:00			`updated_at: datetime`
			`released_at: datetime`
			`published_at: datetime`
			`status: str`
			`platforms: List[str] # Windows/macOS/Linux/etc`
			`publisher: str`
			`author: Dict[str, str] # See impl below!`
			`authors: Dict[str, str] # Links`
			`genre: Dict[str, str] # Links`
			`tools: Dict[str, str] # Links`
			`license: Dict[str, str] # Links`
			`asset_license: Dict[str, str] # Links`
			`tags: Dict[str, str] # Links`
			`length: str`
			`multiplayer: Dict[str, str] # Links`
			`player_count: str`
			`accessibility: Dict[str, str] # Links`
			`inputs: Dict[str, str] # Links`
			`links: Dict[str, str] # Links`
			`mentions: Dict[str, str] # Links`


			`def parse_date_block(td: BeautifulSoup) -> Optional[datetime]:`
			`abbr = td.find("abbr")`
			`if not abbr or 'title' not in abbr.attrs:`
			`return None`

			`date_str, time_str = abbr['title'].split('@')`
			`date = datetime.strptime(date_str.strip(), "%d %B %Y")`
			`time = datetime.strptime(time_str.strip(), "%H:%M")`
			`return datetime(date.year, date.month, date.day, time.hour, time.minute)`
Implement screenshot/cover art downloads, initial infobox parsing 2022-05-15 18:51:13 +02:00

			`def parse_links(td: BeautifulSoup) -> Dict[str, str]:`
			`"""Parses blocks of comma-separated <a> blocks, returns a dict`
			`of link text -> URL it points at."""`
Implement infobox parsing, misc bugfixes, version bump 2022-05-15 20:10:32 +02:00			`return {link.text.strip(): link['href'] for link in td.find_all("a")}`
Implement screenshot/cover art downloads, initial infobox parsing 2022-05-15 18:51:13 +02:00

			`def parse_text_from_links(td: BeautifulSoup) -> List[str]:`
			`return list(parse_links(td).keys())`


			`def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:`
			`if name == "Updated":`
Implement infobox parsing, misc bugfixes, version bump 2022-05-15 20:10:32 +02:00			`return "updated_at", parse_date_block(content)`
			`elif name == "Release date":`
			`return "released_at", parse_date_block(content)`
			`elif name == "Published":`
			`return "published_at", parse_date_block(content)`
			`elif name == "Status":`
			`return "status", parse_text_from_links(content)[0]`
			`elif name == "Platforms":`
			`return "platforms", parse_text_from_links(content)`
			`elif name == "Publisher":`
			`return "publisher", content.text.strip()`
			`elif name == "Rating":`
			`return None # Read the AggregatedRating block instead!`
			`elif name == "Author":`
			`author, author_url = parse_links(content).popitem()`
			`return "author", {"author": author, "author_url": author_url}`
			`elif name == "Authors":`
			`return "authors", parse_links(content)`
			`elif name == "Genre":`
			`return "genre", parse_links(content)`
			`elif name == "Made with":`
			`return "tools", parse_links(content)`
			`elif name == "License":`
			`return "license", parse_links(content)`
			`elif name == "Asset license":`
			`return "asset_license", parse_links(content)`
			`elif name == "Tags":`
			`return "tags", parse_links(content)`
			`elif name == "Average session":`
			`return "length", parse_text_from_links(content)[0]`
			`elif name == "Languages":`
			`return "languages", parse_links(content)`
			`elif name == "Multiplayer":`
			`return "multiplayer", parse_links(content)`
			`elif name == "Player count":`
			`return "player_count", content.text.strip()`
			`elif name == "Accessibility":`
			`return "accessibility", parse_links(content)`
			`elif name == "Inputs":`
			`return "inputs", parse_links(content)`
			`elif name == "Links":`
			`return "links", parse_links(content)`
			`elif name == "Mentions":`
			`return "mentions", parse_links(content)`
			`else:`
			`# Oops, you need to extend this with something new. Sorry.`
			`# Make sure to add the block name to InfoboxMetadata as well!`
			`raise NotImplementedError(f"Unknown infobox block name '{name}' - please file a new itch-dl issue.")`
Implement screenshot/cover art downloads, initial infobox parsing 2022-05-15 18:51:13 +02:00

			`def parse_infobox(infobox: BeautifulSoup) -> dict:`
			`"""Feed it <div class="game_info_panel_widget">, out goes a dict`
			`of parsed metadata blocks."""`
			`meta = InfoboxMetadata()`

			`for tr in infobox.find_all("tr"):`
			`tds = tr.find_all("td")`
			`if len(tds) < 2:`
			`continue`

			`name_td, content_td = tds[0], tds[1]`
			`name = name_td.text.strip()`

			`parsed_block = parse_tr(name, content_td)`
			`if parsed_block:`
Implement infobox parsing, misc bugfixes, version bump 2022-05-15 20:10:32 +02:00			`# noinspection PyTypedDict`
Implement screenshot/cover art downloads, initial infobox parsing 2022-05-15 18:51:13 +02:00			`meta[parsed_block[0]] = parsed_block[1]`

			`return meta`