2022-05-15 18:51:13 +02:00
|
|
|
from datetime import datetime
|
|
|
|
from typing import TypedDict, Dict, List, Any, Tuple, Optional
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
class InfoboxMetadata(TypedDict, total=False):
    """Shape of the dict produced by parse_infobox.

    Every key is optional (total=False): a key is only present when the
    matching infobox row was found. Keys must stay in sync with the
    (key, value) pairs returned by parse_tr.
    """

    # Timestamps, as returned by parse_date_block.
    updated_at: datetime
    released_at: datetime
    published_at: datetime

    status: str
    platforms: List[str]  # Windows/macOS/Linux/etc
    publisher: str
    author: Dict[str, str]  # {"author": ..., "author_url": ...} - see parse_tr
    authors: Dict[str, str]  # Links
    genre: Dict[str, str]  # Links
    tools: Dict[str, str]  # Links
    license: Dict[str, str]  # Links
    code_license: Dict[str, str]  # Links
    asset_license: Dict[str, str]  # Links
    tags: Dict[str, str]  # Links
    length: str
    languages: Dict[str, str]  # Links
    multiplayer: Dict[str, str]  # Links
    player_count: str
    accessibility: Dict[str, str]  # Links
    inputs: Dict[str, str]  # Links
    links: Dict[str, str]  # Links
    mentions: Dict[str, str]  # Links
    category: Dict[str, str]  # Links
|
2022-05-15 20:10:32 +02:00
|
|
|
|
|
|
|
|
|
|
|
def parse_date_block(td: BeautifulSoup) -> Optional[datetime]:
    """Read the timestamp stored in a cell's <abbr title="..."> tag.

    The title is split on "@" into a date part ("%d %B %Y": day, full month
    name, year) and a time part ("%H:%M UTC"). Returns None when the cell has
    no <abbr> tag or the tag has no title attribute. The returned datetime is
    naive: no tzinfo is attached even though the text is labelled UTC.
    """
    abbr = td.find("abbr")
    if abbr and "title" in abbr.attrs:
        day_text, clock_text = abbr["title"].split("@")
        day = datetime.strptime(day_text.strip(), "%d %B %Y")
        clock = datetime.strptime(clock_text.strip(), "%H:%M UTC")
        # `day` is midnight of the parsed date; only hour and minute are known.
        return day.replace(hour=clock.hour, minute=clock.minute)

    return None
|
2022-05-15 18:51:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
def parse_links(td: BeautifulSoup) -> Dict[str, str]:
    """Parses blocks of comma-separated <a> blocks, returns a dict
    of link text -> URL it points at.

    Link text is stripped of surrounding whitespace. When several links share
    the same text, the last one wins. Anchors without an href attribute are
    ignored instead of raising KeyError.
    """
    # href=True limits the search to <a> tags that actually carry an href.
    return {link.text.strip(): link["href"] for link in td.find_all("a", href=True)}
|
2022-05-15 18:51:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
def parse_text_from_links(td: BeautifulSoup) -> List[str]:
    """Return only the link texts found in the cell, dropping the URLs.

    The texts are the keys of parse_links(td), in that dict's order.
    """
    links = parse_links(td)
    return list(links)
|
|
|
|
|
|
|
|
|
|
|
|
# Infobox row label -> InfoboxMetadata key, grouped by how the value cell is parsed.

# Value is a timestamp read by parse_date_block.
_DATE_BLOCKS = {
    "Updated": "updated_at",
    "Release date": "released_at",
    "Published": "published_at",
}

# Value is the text of the first link in the cell.
_FIRST_LINK_TEXT_BLOCKS = {
    "Status": "status",
    "Average session": "length",
}

# Value is the list of all link texts in the cell.
_LINK_TEXT_LIST_BLOCKS = {
    "Platforms": "platforms",
}

# Value is the stripped plain text of the cell.
_PLAIN_TEXT_BLOCKS = {
    "Publisher": "publisher",
    "Player count": "player_count",
}

# Value is the dict of link text -> URL built by parse_links.
_LINK_DICT_BLOCKS = {
    "Authors": "authors",
    "Genre": "genre",
    "Made with": "tools",
    "License": "license",
    "Code license": "code_license",
    "Asset license": "asset_license",
    "Tags": "tags",
    "Languages": "languages",
    "Multiplayer": "multiplayer",
    "Accessibility": "accessibility",
    "Inputs": "inputs",
    "Links": "links",
    "Mentions": "mentions",
    "Category": "category",
}


def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
    """Parse one infobox row into an (InfoboxMetadata key, value) pair.

    Args:
        name: Label text of the row, e.g. "Updated" or "Made with".
        content: The row's value cell.

    Returns:
        A (key, value) tuple, or None when the row should be skipped: the
        "Rating" row, a date row in which no timestamp was found, or an
        "Author" row that contains no link.

    Raises:
        NotImplementedError: If `name` is not a known infobox row label.
    """
    if name in _DATE_BLOCKS:
        parsed_date = parse_date_block(content)
        # InfoboxMetadata types these keys as datetime (and all keys are
        # optional), so omit the key rather than storing None.
        if parsed_date is None:
            return None
        return _DATE_BLOCKS[name], parsed_date

    if name in _FIRST_LINK_TEXT_BLOCKS:
        link_texts = parse_text_from_links(content)
        # Fall back to the cell's plain text instead of failing with
        # IndexError when the value is not wrapped in a link.
        value = link_texts[0] if link_texts else content.text.strip()
        return _FIRST_LINK_TEXT_BLOCKS[name], value

    if name in _LINK_TEXT_LIST_BLOCKS:
        return _LINK_TEXT_LIST_BLOCKS[name], parse_text_from_links(content)

    if name in _PLAIN_TEXT_BLOCKS:
        return _PLAIN_TEXT_BLOCKS[name], content.text.strip()

    if name in _LINK_DICT_BLOCKS:
        return _LINK_DICT_BLOCKS[name], parse_links(content)

    if name == "Rating":
        return None  # Read the AggregatedRating block instead!

    if name == "Author":
        links = parse_links(content)
        if not links:
            # No link to take the author from; popitem() would raise KeyError.
            return None
        # popitem() yields the last link found in the cell.
        author, author_url = links.popitem()
        return "author", {"author": author, "author_url": author_url}

    # Oops, you need to extend this with something new. Sorry.
    # Make sure to add the block name to InfoboxMetadata as well!
    raise NotImplementedError(f"Unknown infobox block name '{name}' - please file a new itch-dl issue.")
|
2022-05-15 18:51:13 +02:00
|
|
|
|
|
|
|
|
2022-06-12 19:31:25 +02:00
|
|
|
def parse_infobox(infobox: BeautifulSoup) -> InfoboxMetadata:
    """Turn a <div class="game_info_panel_widget"> element into a dict
    of parsed metadata blocks.

    Each <tr> contributes at most one entry: its first <td> is the block
    name and its second <td> is the value, both handed to parse_tr. Rows
    with fewer than two cells, and rows for which parse_tr returns nothing,
    are left out. Exceptions raised by parse_tr are not caught here.
    """
    meta = InfoboxMetadata()

    for row in infobox.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue

        # Only the first two cells matter; any further cells are ignored.
        label_cell, value_cell = cells[:2]
        block = parse_tr(label_cell.text.strip(), value_cell)
        if block:
            key, value = block
            meta[key] = value  # noqa (non-literal TypedDict keys)

    return meta
|