itch-dl/itch_dl/infobox.py
Ryszard Knop 816a4d7399 Enable lots of extra Ruff checks
Warns about various small code smells and odd issues we can catch early.
Nothing here should change the program behavior directly.
2025-01-31 23:40:40 +01:00

126 lines
4.5 KiB
Python

from datetime import datetime
from typing import TypedDict, Dict, List, Any, Tuple, Optional
from bs4 import BeautifulSoup
class InfoboxMetadata(TypedDict, total=False):
    """Metadata blocks parsed out of a game page's infobox table.

    All keys are optional (``total=False``): a key is only set when the
    matching row is found in the infobox. The keys declared here must stay
    in sync with the block names returned by ``parse_tr``.
    """

    updated_at: datetime
    released_at: datetime
    published_at: datetime
    status: str
    platforms: List[str]  # Windows/macOS/Linux/etc
    publisher: str
    author: Dict[str, str]  # Not a link dict: {"author": name, "author_url": URL}
    authors: Dict[str, str]  # Links
    genre: Dict[str, str]  # Links
    tools: Dict[str, str]  # Links
    license: Dict[str, str]  # Links
    code_license: Dict[str, str]  # Links
    asset_license: Dict[str, str]  # Links
    tags: Dict[str, str]  # Links
    length: str
    languages: Dict[str, str]  # Links
    multiplayer: Dict[str, str]  # Links
    player_count: str
    accessibility: Dict[str, str]  # Links
    inputs: Dict[str, str]  # Links
    links: Dict[str, str]  # Links
    mentions: Dict[str, str]  # Links
    category: Dict[str, str]  # Links
def parse_date_block(td: BeautifulSoup) -> Optional[datetime]:
    """Read the timestamp kept in the ``title`` of the cell's <abbr> tag.

    The title is split on "@" into a date part ("%d %B %Y") and a time part
    ("%H:%M UTC"). Returns None if the cell has no <abbr>, or the <abbr> has
    no ``title`` attribute. The returned datetime carries no tzinfo.
    """
    abbr = td.find("abbr")
    if abbr and "title" in abbr.attrs:
        day_part, clock_part = abbr["title"].split("@")
        day = datetime.strptime(day_part.strip(), "%d %B %Y").date()
        clock = datetime.strptime(clock_part.strip(), "%H:%M UTC").time()
        return datetime.combine(day, clock)

    return None
def parse_links(td: BeautifulSoup) -> Dict[str, str]:
    """Collect every <a> in the cell as a mapping of link text -> href.

    Link text is stripped of surrounding whitespace. When several links
    share the same text, the href of the last one is kept.
    """
    links: Dict[str, str] = {}
    for anchor in td.find_all("a"):
        label = anchor.text.strip()
        links[label] = anchor["href"]
    return links
def parse_text_from_links(td: BeautifulSoup) -> List[str]:
    """Return only the link texts found in the cell, dropping the URLs.

    Texts come from the keys of ``parse_links``, so repeated texts appear once.
    """
    return [*parse_links(td)]
# Infobox row label -> metadata key, grouped by how the row's cell is parsed.
_DATE_BLOCKS = {
    "Updated": "updated_at",
    "Release date": "released_at",
    "Published": "published_at",
}
_FIRST_LINK_TEXT_BLOCKS = {
    "Status": "status",
    "Average session": "length",
}
_PLAIN_TEXT_BLOCKS = {
    "Publisher": "publisher",
    "Player count": "player_count",
}
_LINK_BLOCKS = {
    "Authors": "authors",
    "Genre": "genre",
    "Made with": "tools",
    "License": "license",
    "Code license": "code_license",
    "Asset license": "asset_license",
    "Tags": "tags",
    "Languages": "languages",
    "Multiplayer": "multiplayer",
    "Accessibility": "accessibility",
    "Inputs": "inputs",
    "Links": "links",
    "Mentions": "mentions",
    "Category": "category",
}


def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
    """Turn one infobox row into a ``(metadata key, parsed value)`` pair.

    ``name`` is the row's label and ``content`` is the cell holding its value.
    Returns None for rows that are skipped on purpose ("Rating").

    Raises:
        NotImplementedError: if ``name`` is not a known infobox block name.
    """
    if name in _DATE_BLOCKS:
        return _DATE_BLOCKS[name], parse_date_block(content)

    if name in _FIRST_LINK_TEXT_BLOCKS:
        # Only the text of the first link is kept for these rows.
        return _FIRST_LINK_TEXT_BLOCKS[name], parse_text_from_links(content)[0]

    if name == "Platforms":
        return "platforms", parse_text_from_links(content)

    if name in _PLAIN_TEXT_BLOCKS:
        return _PLAIN_TEXT_BLOCKS[name], content.text.strip()

    if name in _LINK_BLOCKS:
        return _LINK_BLOCKS[name], parse_links(content)

    if name == "Author":
        # popitem() yields the most recently inserted link, i.e. the last one in the cell.
        author, author_url = parse_links(content).popitem()
        return "author", {"author": author, "author_url": author_url}

    if name == "Rating":
        return None  # Read the AggregatedRating block instead!

    # A block name nobody has handled yet: add it to one of the tables above
    # (or as a special case here), and to InfoboxMetadata as well.
    raise NotImplementedError(f"Unknown infobox block name '{name}' - please file a new itch-dl issue.")
def parse_infobox(infobox: BeautifulSoup) -> InfoboxMetadata:
    """Feed it <div class="game_info_panel_widget">, out goes a dict
    of parsed metadata blocks.

    Every <tr> with at least two <td> cells is treated as a (name, content)
    row and handed to ``parse_tr``; rows with fewer cells are ignored.

    A key is only set when a value was actually parsed: rows that
    ``parse_tr`` skips (returns None) and rows whose parsed value is None
    are left out, so the result never holds None where InfoboxMetadata
    declares a concrete type.

    Raises:
        NotImplementedError: propagated from ``parse_tr`` for an unknown
            block name.
    """
    meta = InfoboxMetadata()

    for tr in infobox.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) < 2:
            continue

        name = tds[0].text.strip()
        parsed_block = parse_tr(name, tds[1])
        if parsed_block is None:
            continue

        key, value = parsed_block
        if value is None:
            # E.g. a date row without an <abbr title="...">: omit the key
            # (allowed by total=False) instead of storing None under it.
            continue

        meta[key] = value  # noqa: PyTypedDict (non-literal TypedDict keys)

    return meta