Implement infobox parsing, misc bugfixes, version bump

2022-05-15 20:10:32 +02:00
parent f5c0f4658d
commit 008e6870e8
5 changed files with 149 additions and 30 deletions
--- a/itch_dl/infobox.py
+++ b/itch_dl/infobox.py
@@ -5,17 +5,43 @@ from bs4 import BeautifulSoup


 class InfoboxMetadata(TypedDict, total=False):
-    pass
+    updated_at: datetime
+    released_at: datetime
+    published_at: datetime
+    status: str
+    platforms: List[str]  # Windows/macOS/Linux/etc
+    publisher: str
+    author: Dict[str, str]  # See impl below!
+    authors: Dict[str, str]  # Links
+    genre: Dict[str, str]  # Links
+    tools: Dict[str, str]  # Links
+    license: Dict[str, str]  # Links
+    asset_license: Dict[str, str]  # Links
+    tags: Dict[str, str]  # Links
+    length: str
+    multiplayer: Dict[str, str]  # Links
+    player_count: str
+    accessibility: Dict[str, str]  # Links
+    inputs: Dict[str, str]  # Links
+    links: Dict[str, str]  # Links
+    mentions: Dict[str, str]  # Links


-def parse_date_block(td: BeautifulSoup) -> datetime:
-    raise NotImplementedError("Not yet!")
+def parse_date_block(td: BeautifulSoup) -> Optional[datetime]:
+    abbr = td.find("abbr")
+    if not abbr or 'title' not in abbr.attrs:
+        return None
+
+    date_str, time_str = abbr['title'].split('@')
+    date = datetime.strptime(date_str.strip(), "%d %B %Y")
+    time = datetime.strptime(time_str.strip(), "%H:%M")
+    return datetime(date.year, date.month, date.day, time.hour, time.minute)


 def parse_links(td: BeautifulSoup) -> Dict[str, str]:
    """Parses blocks of comma-separated <a> blocks, returns a dict
    of link text -> URL it points at."""
-    pass
+    return {link.text.strip(): link['href'] for link in td.find_all("a")}


 def parse_text_from_links(td: BeautifulSoup) -> List[str]:
@@ -24,7 +50,54 @@ def parse_text_from_links(td: BeautifulSoup) -> List[str]:

 def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
    if name == "Updated":
-        pass
+        return "updated_at", parse_date_block(content)
+    elif name == "Release date":
+        return "released_at", parse_date_block(content)
+    elif name == "Published":
+        return "published_at", parse_date_block(content)
+    elif name == "Status":
+        return "status", parse_text_from_links(content)[0]
+    elif name == "Platforms":
+        return "platforms", parse_text_from_links(content)
+    elif name == "Publisher":
+        return "publisher", content.text.strip()
+    elif name == "Rating":
+        return None  # Read the AggregatedRating block instead!
+    elif name == "Author":
+        author, author_url = parse_links(content).popitem()
+        return "author", {"author": author, "author_url": author_url}
+    elif name == "Authors":
+        return "authors", parse_links(content)
+    elif name == "Genre":
+        return "genre", parse_links(content)
+    elif name == "Made with":
+        return "tools", parse_links(content)
+    elif name == "License":
+        return "license", parse_links(content)
+    elif name == "Asset license":
+        return "asset_license", parse_links(content)
+    elif name == "Tags":
+        return "tags", parse_links(content)
+    elif name == "Average session":
+        return "length", parse_text_from_links(content)[0]
+    elif name == "Languages":
+        return "languages", parse_links(content)
+    elif name == "Multiplayer":
+        return "multiplayer", parse_links(content)
+    elif name == "Player count":
+        return "player_count", content.text.strip()
+    elif name == "Accessibility":
+        return "accessibility", parse_links(content)
+    elif name == "Inputs":
+        return "inputs", parse_links(content)
+    elif name == "Links":
+        return "links", parse_links(content)
+    elif name == "Mentions":
+        return "mentions", parse_links(content)
+    else:
+        # Oops, you need to extend this with something new. Sorry.
+        # Make sure to add the block name to InfoboxMetadata as well!
+        raise NotImplementedError(f"Unknown infobox block name '{name}' - please file a new itch-dl issue.")


 def parse_infobox(infobox: BeautifulSoup) -> dict:
@@ -42,6 +115,7 @@ def parse_infobox(infobox: BeautifulSoup) -> dict:

        parsed_block = parse_tr(name, content_td)
        if parsed_block:
+            # noinspection PyTypedDict
            meta[parsed_block[0]] = parsed_block[1]

    return meta