1
0
forked from Mirrors/itch-dl

Implement infobox parsing, misc bugfixes, version bump

This commit is contained in:
Ryszard Knop
2022-05-15 20:10:32 +02:00
parent f5c0f4658d
commit 008e6870e8
5 changed files with 149 additions and 30 deletions

View File

@@ -5,17 +5,43 @@ from bs4 import BeautifulSoup
class InfoboxMetadata(TypedDict, total=False):
    """Typed view of the metadata dict built from a game page's infobox.

    Each key corresponds to one of the names returned by ``parse_tr``.
    The dict is declared with ``total=False``, so every key is optional:
    a key is only present when the matching infobox row was found.
    """

    # Date rows. ``parse_date_block`` returns None when the cell has no
    # usable <abbr title="...">, and that None is stored as the value.
    updated_at: Optional[datetime]
    released_at: Optional[datetime]
    published_at: Optional[datetime]

    status: str
    platforms: List[str]  # Windows/macOS/Linux/etc
    publisher: str
    author: Dict[str, str]  # {"author": <name>, "author_url": <URL>}
    authors: Dict[str, str]  # Links
    genre: Dict[str, str]  # Links
    tools: Dict[str, str]  # Links
    license: Dict[str, str]  # Links
    asset_license: Dict[str, str]  # Links
    tags: Dict[str, str]  # Links
    length: str
    languages: Dict[str, str]  # Links
    multiplayer: Dict[str, str]  # Links
    player_count: str
    accessibility: Dict[str, str]  # Links
    inputs: Dict[str, str]  # Links
    links: Dict[str, str]  # Links
    mentions: Dict[str, str]  # Links
def parse_date_block(td: BeautifulSoup) -> Optional[datetime]:
    """Parse the timestamp stored in an infobox date cell.

    The cell is searched for an ``<abbr>`` element whose ``title``
    attribute holds a date and a time separated by ``@``, for example
    ``15 May 2022 @ 20:10``.

    Returns:
        A naive ``datetime`` (no tzinfo attached), or None when the cell
        has no ``<abbr>`` element or the element has no ``title``.

    Raises:
        ValueError: If the title is not in the expected
            ``<day> <month name> <year> @ <HH:MM>`` format.
    """
    abbr = td.find("abbr")
    if not abbr or "title" not in abbr.attrs:
        return None

    title = abbr.attrs["title"]

    # partition() never fails to unpack, so a missing separator can be
    # reported with the offending text instead of an opaque unpack error.
    date_str, separator, time_str = title.partition("@")
    if not separator:
        raise ValueError(f"Unexpected infobox date format: {title!r}")

    date = datetime.strptime(date_str.strip(), "%d %B %Y")
    time = datetime.strptime(time_str.strip(), "%H:%M")
    return datetime.combine(date.date(), time.time())
def parse_links(td: BeautifulSoup) -> Dict[str, str]:
    """Collect the links found in an infobox cell.

    Every ``<a>`` element in the cell contributes one entry mapping its
    whitespace-stripped text to its ``href``. Anchors without an ``href``
    attribute point nowhere and are skipped. If two anchors share the
    same text, the later one wins.

    Returns:
        A dict of link text -> URL; empty when the cell has no links.
    """
    return {
        link.text.strip(): link.attrs["href"]
        for link in td.find_all("a")
        if "href" in link.attrs
    }
def parse_text_from_links(td: BeautifulSoup) -> List[str]:
@@ -24,7 +50,54 @@ def parse_text_from_links(td: BeautifulSoup) -> List[str]:
# Infobox rows whose cell holds a timestamp; values are the metadata keys.
_DATE_BLOCKS = {
    "Updated": "updated_at",
    "Release date": "released_at",
    "Published": "published_at",
}

# Rows reduced to the text of the first link found in the cell.
_FIRST_LINK_TEXT_BLOCKS = {
    "Status": "status",
    "Average session": "length",
}

# Rows stored as the cell's stripped plain text.
_PLAIN_TEXT_BLOCKS = {
    "Publisher": "publisher",
    "Player count": "player_count",
}

# Rows stored as a dict of link text -> URL.
_LINK_BLOCKS = {
    "Authors": "authors",
    "Genre": "genre",
    "Made with": "tools",
    "License": "license",
    "Asset license": "asset_license",
    "Tags": "tags",
    "Languages": "languages",
    "Multiplayer": "multiplayer",
    "Accessibility": "accessibility",
    "Inputs": "inputs",
    "Links": "links",
    "Mentions": "mentions",
}


def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
    """Convert one infobox row into a ``(metadata key, value)`` pair.

    Args:
        name: The row's label, e.g. ``"Updated"`` or ``"Tags"``.
        content: The cell holding the row's value.

    Returns:
        The metadata key together with the parsed value, or None for
        rows that are deliberately skipped (currently only "Rating").

    Raises:
        NotImplementedError: If the row label is not recognised.
    """
    if name == "Rating":
        # Ratings are taken from the AggregatedRating block instead.
        return None

    if name in _DATE_BLOCKS:
        return _DATE_BLOCKS[name], parse_date_block(content)

    if name in _FIRST_LINK_TEXT_BLOCKS:
        return _FIRST_LINK_TEXT_BLOCKS[name], parse_text_from_links(content)[0]

    if name in _PLAIN_TEXT_BLOCKS:
        return _PLAIN_TEXT_BLOCKS[name], content.text.strip()

    if name in _LINK_BLOCKS:
        return _LINK_BLOCKS[name], parse_links(content)

    if name == "Platforms":
        return "platforms", parse_text_from_links(content)

    if name == "Author":
        # A single entry is taken from the parsed links and flattened
        # into explicit name/URL fields.
        author, author_url = parse_links(content).popitem()
        return "author", {"author": author, "author_url": author_url}

    # A new row type showed up: it needs a branch (or table entry) here
    # and a matching key in InfoboxMetadata.
    raise NotImplementedError(f"Unknown infobox block name '{name}' - please file a new itch-dl issue.")
def parse_infobox(infobox: BeautifulSoup) -> dict:
@@ -42,6 +115,7 @@ def parse_infobox(infobox: BeautifulSoup) -> dict:
parsed_block = parse_tr(name, content_td)
if parsed_block:
# noinspection PyTypedDict
meta[parsed_block[0]] = parsed_block[1]
return meta