From 008e6870e851d1a6ed1f6a8406290940325560c2 Mon Sep 17 00:00:00 2001 From: Ryszard Knop Date: Sun, 15 May 2022 20:10:32 +0200 Subject: [PATCH] Implement infobox parsing, misc bugfixes, version bump --- README.md | 14 ++++---- itch_dl/__init__.py | 2 +- itch_dl/consts.py | 2 +- itch_dl/downloader.py | 77 ++++++++++++++++++++++++++++++--------- itch_dl/infobox.py | 84 ++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 149 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 492d022..c70ac48 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,13 @@ Bulk download games from [itch.io](https://itch.io/). - Can download game jams, browse pages (popular, newest, browse by tag...) and individual games. - Requires Python 3.8+, grab it from PyPI: `pip install itch-dl` - For development, use [Poetry](https://python-poetry.org/). -- Optionally requires wget for site mirroring. -How to use this: + +## How to use - Log into itch.io with the account you'd like to use for downloading. -- Generate a new API key on your user account page: https://itch.io/user/settings/api-keys +- Generate [a new API key](https://itch.io/user/settings/api-keys) on your user account page. +- Check out which flags you can toggle: `itch-dl --help` - Run the downloader: `itch-dl --api-key https://itch.io/jam/yourjamhere` - Wait. This is going to take a while. @@ -21,12 +22,13 @@ game jam. The input can also be a path to a itch.io JSON file with game jam entr a list of itch.io game URLs (not browse/jam pages!) to download. **It's expected that the downloader output will not be complete** - logs are stupidly verbose -and it prints a report on successful/failed downloads, so you must manually grab whatever was -not handled for you automatically for some reason. +and it prints a report on failed downloads and external URLs (links to files that are not on +itch.io itself, but rather on an external host like Google Drive, Dropbox, etc), so you must +manually grab whatever was not handled for you automatically. The downloader also grabs the entry page HTML, which usually comes with controls and such. By default, it does not download images, assets and so on, just the text - use `--mirror-web` to -try and download these as well. This requires `wget` to be available in your `PATH`. +try and download these as well. This does not work very well yet, but gets the basics done. ## Game Jam Entries JSON diff --git a/itch_dl/__init__.py b/itch_dl/__init__.py index b794fd4..7fd229a 100644 --- a/itch_dl/__init__.py +++ b/itch_dl/__init__.py @@ -1 +1 @@ -__version__ = '0.1.0' +__version__ = '0.2.0' diff --git a/itch_dl/consts.py b/itch_dl/consts.py index fac9493..df07aad 100644 --- a/itch_dl/consts.py +++ b/itch_dl/consts.py @@ -2,7 +2,7 @@ ITCH_BASE = "itch.io" ITCH_URL = f"https://{ITCH_BASE}" ITCH_API = f"https://api.{ITCH_BASE}" -# Extracts https://user.itch.io/gamename to {'author': 'user', 'game': 'gamename'} +# Extracts https://user.itch.io/name to {'author': 'user', 'game': 'name'} ITCH_GAME_URL_REGEX = r"^https:\/\/(?P[\w\d\-_]+).itch.io\/(?P[\w\d\-_]+)$" ITCH_BROWSER_TYPES = [ diff --git a/itch_dl/downloader.py b/itch_dl/downloader.py index 231a1fa..9ea01bb 100644 --- a/itch_dl/downloader.py +++ b/itch_dl/downloader.py @@ -2,7 +2,8 @@ import os import json import re import logging -from typing import Tuple, List, Dict, TypedDict, Optional +import urllib.parse +from typing import List, Dict, TypedDict, Optional, Union from bs4 import BeautifulSoup from requests.exceptions import HTTPError @@ -13,7 +14,7 @@ from tqdm.contrib.concurrent import thread_map from .api import ItchApiClient from .utils import ItchDownloadError, get_int_after_marker_in_json from .consts import ITCH_GAME_URL_REGEX -from .infobox import parse_infobox +from .infobox import parse_infobox, InfoboxMetadata TARGET_PATHS = { 'site': 'site.html', @@ -25,11 +26,11 @@ TARGET_PATHS = { class DownloadResult: - def __init__(self, url: str, success: bool, errors, external_urls: Optional[List[str]] = None): + def __init__(self, url: str, success: bool, errors, external_urls: List[str]): self.url = url self.success = success - self.errors = errors - self.external_urls = external_urls + self.errors = errors or [] + self.external_urls = external_urls or [] class GameMetadata(TypedDict, total=False): @@ -47,7 +48,11 @@ class GameMetadata(TypedDict, total=False): screenshots: List[str] description: str + rating: Dict[str, Union[float, int]] + extra: InfoboxMetadata + created_at: str + updated_at: str released_at: str published_at: str @@ -60,7 +65,8 @@ class GameDownloader: self.download_keys = keys self.client = ItchApiClient(api_key) - def get_rating_json(self, site) -> Optional[dict]: + @staticmethod + def get_rating_json(site) -> Optional[dict]: for ldjson_node in site.find_all("script", type="application/ld+json"): try: ldjson: dict = json.loads(ldjson_node.text.strip()) @@ -71,7 +77,8 @@ class GameDownloader: return None - def get_meta(self, site, **kwargs) -> Optional[str]: + @staticmethod + def get_meta(site, **kwargs) -> Optional[str]: """Grabs values.""" node = site.find("meta", attrs=kwargs) if not node: @@ -140,8 +147,34 @@ class GameDownloader: infobox_div = site.find("div", class_="game_info_panel_widget") if infobox_div: infobox = parse_infobox(infobox_div) + for dt in ('created_at', 'updated_at', 'released_at', 'published_at'): + if dt in infobox: + # noinspection PyTypedDict + metadata[dt] = infobox[dt].isoformat() + del infobox[dt] - TODO_KEYS = ['author', 'author_url', 'created_at', 'released_at', 'published_at'] + if 'author' in infobox: + metadata['author'] = infobox['author']['author'] + metadata['author_url'] = infobox['author']['author_url'] + del infobox['author'] + + if 'authors' in infobox and 'author' not in metadata: + # Some games may have multiple authors (ex. compilations). + metadata['author'] = "Multiple authors" + metadata['author_url'] = f"https://{urllib.parse.urlparse(url).netloc}" + + metadata['extra'] = infobox + + agg_rating = rating_json.get('aggregateRating') + if agg_rating: + try: + metadata['rating'] = { + 'average': float(agg_rating['ratingValue']), + 'votes': agg_rating['ratingCount'] + } + except: # noqa + logging.exception("Could not extract the rating metadata...") + pass # Nope, just, don't return metadata @@ -179,7 +212,7 @@ class GameDownloader: def download(self, url: str, skip_downloaded: bool = True): match = re.match(ITCH_GAME_URL_REGEX, url) if not match: - return DownloadResult(url, False, [f"Game URL is invalid: {url} - please file a new issue."]) + return DownloadResult(url, False, [f"Game URL is invalid: {url} - please file a new issue."], []) author, game = match['author'], match['game'] @@ -192,14 +225,14 @@ class GameDownloader: # As metadata is the final file we write, all the files # should already be downloaded at this point. logging.info("Skipping already-downloaded game for URL: %s", url) - return DownloadResult(url, True, [f"Game already downloaded."]) + return DownloadResult(url, True, [f"Game already downloaded."], []) try: logging.info("Downloading %s", url) r = self.client.get(url, append_api_key=False) r.raise_for_status() except Exception as e: - return DownloadResult(url, False, [f"Could not download the game site for {url}: {e}"]) + return DownloadResult(url, False, [f"Could not download the game site for {url}: {e}"], []) site = BeautifulSoup(r.text, features="lxml") try: @@ -207,14 +240,14 @@ class GameDownloader: metadata = self.extract_metadata(game_id, url, site) title = metadata['title'] or game except ItchDownloadError as e: - return DownloadResult(url, False, [str(e)]) + return DownloadResult(url, False, [str(e)], []) credentials = self.get_credentials(title, game_id) try: game_uploads_req = self.client.get(f"/games/{game_id}/uploads", data=credentials, timeout=15) game_uploads_req.raise_for_status() except Exception as e: - return DownloadResult(url, False, [f"Could not fetch game uploads for {title}: {e}"]) + return DownloadResult(url, False, [f"Could not fetch game uploads for {title}: {e}"], []) game_uploads = game_uploads_req.json()['uploads'] logging.debug("Found %d upload(s): %s", len(game_uploads), str(game_uploads)) @@ -264,17 +297,20 @@ class GameDownloader: if len(external_urls) > 0: logging.warning(f"Game {title} has external download URLs: {external_urls}") - # TODO: Screenshots and site assets + # TODO: Mirror JS/CSS assets if self.mirror_web: os.makedirs(paths['screenshots'], exist_ok=True) for screenshot in metadata['screenshots']: + if not screenshot: + continue + file_name = os.path.basename(screenshot) try: self.download_file(screenshot, os.path.join(paths['screenshots'], file_name), credentials={}) except Exception as e: errors.append(f"Screenshot download failed (this is not fatal): {e}") - if 'cover_url' in metadata: + if metadata.get('cover_url'): try: cover_url = metadata['cover_url'] self.download_file(cover_url, paths['cover'] + os.path.splitext(cover_url)[-1], credentials={}) @@ -285,7 +321,7 @@ class GameDownloader: f.write(site.prettify()) with open(paths['metadata'], 'w') as f: - json.dump(metadata, f) + json.dump(metadata, f, indent=4) if len(errors) > 0: logging.error(f"Game {title} has download errors: {errors}") @@ -294,7 +330,14 @@ class GameDownloader: return DownloadResult(url, len(errors) == 0, errors, external_urls) -def drive_downloads(jobs: List[str], download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str], parallel: int = 1): +def drive_downloads( + jobs: List[str], + download_to: str, + mirror_web: bool, + api_key: str, + keys: Dict[int, str], + parallel: int = 1 +): downloader = GameDownloader(download_to, mirror_web, api_key, keys) tqdm_args = { "desc": "Games", diff --git a/itch_dl/infobox.py b/itch_dl/infobox.py index 157d533..92addbc 100644 --- a/itch_dl/infobox.py +++ b/itch_dl/infobox.py @@ -5,17 +5,43 @@ from bs4 import BeautifulSoup class InfoboxMetadata(TypedDict, total=False): - pass + updated_at: datetime + released_at: datetime + published_at: datetime + status: str + platforms: List[str] # Windows/macOS/Linux/etc + publisher: str + author: Dict[str, str] # See impl below! + authors: Dict[str, str] # Links + genre: Dict[str, str] # Links + tools: Dict[str, str] # Links + license: Dict[str, str] # Links + asset_license: Dict[str, str] # Links + tags: Dict[str, str] # Links + length: str + multiplayer: Dict[str, str] # Links + player_count: str + accessibility: Dict[str, str] # Links + inputs: Dict[str, str] # Links + links: Dict[str, str] # Links + mentions: Dict[str, str] # Links -def parse_date_block(td: BeautifulSoup) -> datetime: - raise NotImplementedError("Not yet!") +def parse_date_block(td: BeautifulSoup) -> Optional[datetime]: + abbr = td.find("abbr") + if not abbr or 'title' not in abbr.attrs: + return None + + date_str, time_str = abbr['title'].split('@') + date = datetime.strptime(date_str.strip(), "%d %B %Y") + time = datetime.strptime(time_str.strip(), "%H:%M") + return datetime(date.year, date.month, date.day, time.hour, time.minute) def parse_links(td: BeautifulSoup) -> Dict[str, str]: """Parses blocks of comma-separated blocks, returns a dict of link text -> URL it points at.""" - pass + return {link.text.strip(): link['href'] for link in td.find_all("a")} def parse_text_from_links(td: BeautifulSoup) -> List[str]: @@ -24,7 +50,54 @@ def parse_text_from_links(td: BeautifulSoup) -> List[str]: def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]: if name == "Updated": - pass + return "updated_at", parse_date_block(content) + elif name == "Release date": + return "released_at", parse_date_block(content) + elif name == "Published": + return "published_at", parse_date_block(content) + elif name == "Status": + return "status", parse_text_from_links(content)[0] + elif name == "Platforms": + return "platforms", parse_text_from_links(content) + elif name == "Publisher": + return "publisher", content.text.strip() + elif name == "Rating": + return None # Read the AggregatedRating block instead! + elif name == "Author": + author, author_url = parse_links(content).popitem() + return "author", {"author": author, "author_url": author_url} + elif name == "Authors": + return "authors", parse_links(content) + elif name == "Genre": + return "genre", parse_links(content) + elif name == "Made with": + return "tools", parse_links(content) + elif name == "License": + return "license", parse_links(content) + elif name == "Asset license": + return "asset_license", parse_links(content) + elif name == "Tags": + return "tags", parse_links(content) + elif name == "Average session": + return "length", parse_text_from_links(content)[0] + elif name == "Languages": + return "languages", parse_links(content) + elif name == "Multiplayer": + return "multiplayer", parse_links(content) + elif name == "Player count": + return "player_count", content.text.strip() + elif name == "Accessibility": + return "accessibility", parse_links(content) + elif name == "Inputs": + return "inputs", parse_links(content) + elif name == "Links": + return "links", parse_links(content) + elif name == "Mentions": + return "mentions", parse_links(content) + else: + # Oops, you need to extend this with something new. Sorry. + # Make sure to add the block name to InfoboxMetadata as well! + raise NotImplementedError(f"Unknown infobox block name '{name}' - please file a new itch-dl issue.") def parse_infobox(infobox: BeautifulSoup) -> dict: @@ -42,6 +115,7 @@ def parse_infobox(infobox: BeautifulSoup) -> dict: parsed_block = parse_tr(name, content_td) if parsed_block: + # noinspection PyTypedDict meta[parsed_block[0]] = parsed_block[1] return meta