Implement infobox parsing, misc bugfixes, version bump

Ryszard Knop 2022-05-15 20:10:32 +02:00
parent f5c0f4658d
commit 008e6870e8
5 changed files with 149 additions and 30 deletions

View File

@@ -5,12 +5,13 @@ Bulk download games from [itch.io](https://itch.io/).
 - Can download game jams, browse pages (popular, newest, browse by tag...) and individual games.
 - Requires Python 3.8+, grab it from PyPI: `pip install itch-dl`
 - For development, use [Poetry](https://python-poetry.org/).
-- Optionally requires wget for site mirroring.
 
-How to use this:
+## How to use
 
 - Log into itch.io with the account you'd like to use for downloading.
-- Generate a new API key on your user account page: https://itch.io/user/settings/api-keys
+- Generate [a new API key](https://itch.io/user/settings/api-keys) on your user account page.
+- Check out which flags you can toggle: `itch-dl --help`
 - Run the downloader: `itch-dl --api-key <KEY> https://itch.io/jam/yourjamhere`
 - Wait. This is going to take a while.
@@ -21,12 +22,13 @@ game jam. The input can also be a path to an itch.io JSON file with game jam entries
 a list of itch.io game URLs (not browse/jam pages!) to download.
 
 **It's expected that the downloader output will not be complete** - logs are stupidly verbose
-and it prints a report on successful/failed downloads, so you must manually grab whatever was
-not handled for you automatically for some reason.
+and it prints a report on failed downloads and external URLs (links to files that are not on
+itch.io itself, but rather on an external host like Google Drive, Dropbox, etc), so you must
+manually grab whatever was not handled for you automatically.
 
 The downloader also grabs the entry page HTML, which usually comes with controls and such. By
 default, it does not download images, assets and so on, just the text - use `--mirror-web` to
-try and download these as well. This requires `wget` to be available in your `PATH`.
+try and download these as well. This does not work very well yet, but gets the basics done.
 
 ## Game Jam Entries JSON

View File

@@ -1 +1 @@
-__version__ = '0.1.0'
+__version__ = '0.2.0'

View File

@@ -2,7 +2,7 @@ ITCH_BASE = "itch.io"
 ITCH_URL = f"https://{ITCH_BASE}"
 ITCH_API = f"https://api.{ITCH_BASE}"
 
-# Extracts https://user.itch.io/gamename to {'author': 'user', 'game': 'gamename'}
+# Extracts https://user.itch.io/name to {'author': 'user', 'game': 'name'}
 ITCH_GAME_URL_REGEX = r"^https:\/\/(?P<author>[\w\d\-_]+).itch.io\/(?P<game>[\w\d\-_]+)$"
 
 ITCH_BROWSER_TYPES = [
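As a quick illustration of what the updated comment describes, here is a minimal sketch of matching a game URL with this regex via `re.match` and named groups (the sample URL is made up):

import re

ITCH_GAME_URL_REGEX = r"^https:\/\/(?P<author>[\w\d\-_]+).itch.io\/(?P<game>[\w\d\-_]+)$"

# Hypothetical URL; any https://<author>.itch.io/<game> page should match.
match = re.match(ITCH_GAME_URL_REGEX, "https://user.itch.io/name")
if match:
    print(match['author'], match['game'])  # -> user name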

View File

@@ -2,7 +2,8 @@ import os
 import json
 import re
 import logging
-from typing import Tuple, List, Dict, TypedDict, Optional
+import urllib.parse
+from typing import List, Dict, TypedDict, Optional, Union
 
 from bs4 import BeautifulSoup
 from requests.exceptions import HTTPError
@@ -13,7 +14,7 @@ from tqdm.contrib.concurrent import thread_map
 from .api import ItchApiClient
 from .utils import ItchDownloadError, get_int_after_marker_in_json
 from .consts import ITCH_GAME_URL_REGEX
-from .infobox import parse_infobox
+from .infobox import parse_infobox, InfoboxMetadata
 
 TARGET_PATHS = {
     'site': 'site.html',
@@ -25,11 +26,11 @@ TARGET_PATHS = {
 
 class DownloadResult:
-    def __init__(self, url: str, success: bool, errors, external_urls: Optional[List[str]] = None):
+    def __init__(self, url: str, success: bool, errors, external_urls: List[str]):
         self.url = url
         self.success = success
-        self.errors = errors
-        self.external_urls = external_urls
+        self.errors = errors or []
+        self.external_urls = external_urls or []
 
 
 class GameMetadata(TypedDict, total=False):
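The constructor change makes `external_urls` a required argument and normalizes falsy `errors`/`external_urls` values with `or []`, so callers can always iterate both lists. A minimal sketch of the new contract (the URL is a placeholder):

result = DownloadResult("https://user.itch.io/name", False, None, [])
assert result.errors == []         # None was coerced to an empty list
assert result.external_urls == []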
@@ -47,7 +48,11 @@ class GameMetadata(TypedDict, total=False):
     screenshots: List[str]
     description: str
 
+    rating: Dict[str, Union[float, int]]
+    extra: InfoboxMetadata
+
     created_at: str
+    updated_at: str
     released_at: str
     published_at: str
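For orientation, a hypothetical example of a populated `GameMetadata` dict after this change (all values invented; `total=False` keeps every key optional):

metadata: GameMetadata = {
    'screenshots': ["https://example.com/screenshot.png"],
    'description': "A short example description.",
    'rating': {'average': 4.5, 'votes': 30},
    'extra': {'status': "Released", 'platforms': ["Windows"]},
    'created_at': "2022-05-15T20:10:00",
    'updated_at': "2022-05-15T20:10:00",
}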
@@ -60,7 +65,8 @@ class GameDownloader:
         self.download_keys = keys
         self.client = ItchApiClient(api_key)
 
-    def get_rating_json(self, site) -> Optional[dict]:
+    @staticmethod
+    def get_rating_json(site) -> Optional[dict]:
         for ldjson_node in site.find_all("script", type="application/ld+json"):
             try:
                 ldjson: dict = json.loads(ldjson_node.text.strip())
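`get_rating_json` no longer touches instance state, hence the `@staticmethod`. A minimal sketch of the lookup it performs, on a hand-written page snippet (assuming `lxml` is installed):

import json
from bs4 import BeautifulSoup

html = '''<script type="application/ld+json">
{"@type": "Product", "aggregateRating": {"ratingValue": "4.5", "ratingCount": 30}}
</script>'''
site = BeautifulSoup(html, features="lxml")
for node in site.find_all("script", type="application/ld+json"):
    ldjson = json.loads(node.text.strip())
    print(ldjson.get("aggregateRating"))  # -> {'ratingValue': '4.5', 'ratingCount': 30}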
@@ -71,7 +77,8 @@
         return None
 
-    def get_meta(self, site, **kwargs) -> Optional[str]:
+    @staticmethod
+    def get_meta(site, **kwargs) -> Optional[str]:
         """Grabs <meta property="xyz" content="value"/> values."""
         node = site.find("meta", attrs=kwargs)
         if not node:
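Same refactor for `get_meta`. A usage sketch on an invented snippet; per the docstring, it returns the `content` attribute of the first matching `<meta>` tag, or `None` when there is no match:

site = BeautifulSoup('<meta property="og:image" content="https://example.com/cover.png">',
                     features="lxml")
cover = GameDownloader.get_meta(site, property="og:image")  # -> the content URL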
@@ -140,8 +147,34 @@
         infobox_div = site.find("div", class_="game_info_panel_widget")
         if infobox_div:
             infobox = parse_infobox(infobox_div)
+            for dt in ('created_at', 'updated_at', 'released_at', 'published_at'):
+                if dt in infobox:
+                    # noinspection PyTypedDict
+                    metadata[dt] = infobox[dt].isoformat()
+                    del infobox[dt]
 
-            TODO_KEYS = ['author', 'author_url', 'created_at', 'released_at', 'published_at']
+            if 'author' in infobox:
+                metadata['author'] = infobox['author']['author']
+                metadata['author_url'] = infobox['author']['author_url']
+                del infobox['author']
+
+            if 'authors' in infobox and 'author' not in metadata:
+                # Some games may have multiple authors (ex. compilations).
+                metadata['author'] = "Multiple authors"
+                metadata['author_url'] = f"https://{urllib.parse.urlparse(url).netloc}"
+
+            metadata['extra'] = infobox
+
+        agg_rating = rating_json.get('aggregateRating')
+        if agg_rating:
+            try:
+                metadata['rating'] = {
+                    'average': float(agg_rating['ratingValue']),
+                    'votes': agg_rating['ratingCount']
+                }
+            except:  # noqa
+                logging.exception("Could not extract the rating metadata...")
+                pass  # Nope, just, don't
 
         return metadata
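A small sketch of the two conversions introduced above, on invented values: infobox dates are `datetime` objects serialized with `isoformat()`, and the ld+json rating value may arrive as a string, hence the `float()` cast:

from datetime import datetime

infobox = {'updated_at': datetime(2022, 5, 15, 20, 10)}
print(infobox['updated_at'].isoformat())  # -> '2022-05-15T20:10:00'

agg_rating = {'ratingValue': "4.5", 'ratingCount': 30}
print({'average': float(agg_rating['ratingValue']), 'votes': agg_rating['ratingCount']})
# -> {'average': 4.5, 'votes': 30}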
@@ -179,7 +212,7 @@
     def download(self, url: str, skip_downloaded: bool = True):
         match = re.match(ITCH_GAME_URL_REGEX, url)
         if not match:
-            return DownloadResult(url, False, [f"Game URL is invalid: {url} - please file a new issue."])
+            return DownloadResult(url, False, [f"Game URL is invalid: {url} - please file a new issue."], [])
 
         author, game = match['author'], match['game']
@@ -192,14 +225,14 @@
             # As metadata is the final file we write, all the files
             # should already be downloaded at this point.
             logging.info("Skipping already-downloaded game for URL: %s", url)
-            return DownloadResult(url, True, [f"Game already downloaded."])
+            return DownloadResult(url, True, [f"Game already downloaded."], [])
 
         try:
             logging.info("Downloading %s", url)
             r = self.client.get(url, append_api_key=False)
             r.raise_for_status()
         except Exception as e:
-            return DownloadResult(url, False, [f"Could not download the game site for {url}: {e}"])
+            return DownloadResult(url, False, [f"Could not download the game site for {url}: {e}"], [])
 
         site = BeautifulSoup(r.text, features="lxml")
         try:
@@ -207,14 +240,14 @@
             metadata = self.extract_metadata(game_id, url, site)
             title = metadata['title'] or game
         except ItchDownloadError as e:
-            return DownloadResult(url, False, [str(e)])
+            return DownloadResult(url, False, [str(e)], [])
 
         credentials = self.get_credentials(title, game_id)
         try:
             game_uploads_req = self.client.get(f"/games/{game_id}/uploads", data=credentials, timeout=15)
             game_uploads_req.raise_for_status()
         except Exception as e:
-            return DownloadResult(url, False, [f"Could not fetch game uploads for {title}: {e}"])
+            return DownloadResult(url, False, [f"Could not fetch game uploads for {title}: {e}"], [])
 
         game_uploads = game_uploads_req.json()['uploads']
         logging.debug("Found %d upload(s): %s", len(game_uploads), str(game_uploads))
@@ -264,17 +297,20 @@
         if len(external_urls) > 0:
             logging.warning(f"Game {title} has external download URLs: {external_urls}")
 
-        # TODO: Screenshots and site assets
+        # TODO: Mirror JS/CSS assets
         if self.mirror_web:
             os.makedirs(paths['screenshots'], exist_ok=True)
             for screenshot in metadata['screenshots']:
+                if not screenshot:
+                    continue
+
                 file_name = os.path.basename(screenshot)
                 try:
                     self.download_file(screenshot, os.path.join(paths['screenshots'], file_name), credentials={})
                 except Exception as e:
                     errors.append(f"Screenshot download failed (this is not fatal): {e}")
 
-        if 'cover_url' in metadata:
+        if metadata.get('cover_url'):
             try:
                 cover_url = metadata['cover_url']
                 self.download_file(cover_url, paths['cover'] + os.path.splitext(cover_url)[-1], credentials={})
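The `cover_url` tweak is a falsy-value fix: a key can be present but empty, and only `.get()` guards both cases. Sketched on an invented dict:

metadata = {'cover_url': None}           # hypothetical: page had no cover
print('cover_url' in metadata)           # True  - old check would try to download None
print(bool(metadata.get('cover_url')))   # False - new check skips it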
@@ -285,7 +321,7 @@
             f.write(site.prettify())
 
         with open(paths['metadata'], 'w') as f:
-            json.dump(metadata, f)
+            json.dump(metadata, f, indent=4)
 
         if len(errors) > 0:
             logging.error(f"Game {title} has download errors: {errors}")
@@ -294,7 +330,14 @@
         return DownloadResult(url, len(errors) == 0, errors, external_urls)
 
 
-def drive_downloads(jobs: List[str], download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str], parallel: int = 1):
+def drive_downloads(
+    jobs: List[str],
+    download_to: str,
+    mirror_web: bool,
+    api_key: str,
+    keys: Dict[int, str],
+    parallel: int = 1
+):
     downloader = GameDownloader(download_to, mirror_web, api_key, keys)
     tqdm_args = {
         "desc": "Games",

View File

@@ -5,17 +5,43 @@ from bs4 import BeautifulSoup
 
 class InfoboxMetadata(TypedDict, total=False):
-    pass
+    updated_at: datetime
+    released_at: datetime
+    published_at: datetime
+    status: str
+    platforms: List[str]  # Windows/macOS/Linux/etc
+    publisher: str
+    author: Dict[str, str]  # See impl below!
+    authors: Dict[str, str]  # Links
+    genre: Dict[str, str]  # Links
+    tools: Dict[str, str]  # Links
+    license: Dict[str, str]  # Links
+    asset_license: Dict[str, str]  # Links
+    tags: Dict[str, str]  # Links
+    length: str
+    multiplayer: Dict[str, str]  # Links
+    player_count: str
+    accessibility: Dict[str, str]  # Links
+    inputs: Dict[str, str]  # Links
+    links: Dict[str, str]  # Links
+    mentions: Dict[str, str]  # Links
 
 
-def parse_date_block(td: BeautifulSoup) -> datetime:
-    raise NotImplementedError("Not yet!")
+def parse_date_block(td: BeautifulSoup) -> Optional[datetime]:
+    abbr = td.find("abbr")
+    if not abbr or 'title' not in abbr.attrs:
+        return None
+
+    date_str, time_str = abbr['title'].split('@')
+    date = datetime.strptime(date_str.strip(), "%d %B %Y")
+    time = datetime.strptime(time_str.strip(), "%H:%M")
+
+    return datetime(date.year, date.month, date.day, time.hour, time.minute)
 
 
 def parse_links(td: BeautifulSoup) -> Dict[str, str]:
     """Parses blocks of comma-separated <a> blocks, returns a dict
     of link text -> URL it points at."""
-    pass
+    return {link.text.strip(): link['href'] for link in td.find_all("a")}
 
 
 def parse_text_from_links(td: BeautifulSoup) -> List[str]:
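A sketch of `parse_date_block` on a hand-written cell in the format the parser expects, `<abbr title="DD Month YYYY @ HH:MM">` (the cell is wrapped in a table so the lxml parser keeps the `<td>`):

from bs4 import BeautifulSoup

td = BeautifulSoup('<table><tr><td><abbr title="15 May 2022 @ 20:10">42 days ago</abbr></td></tr></table>',
                   features="lxml").find("td")
print(parse_date_block(td))  # -> 2022-05-15 20:10:00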
@@ -24,7 +50,54 @@ def parse_text_from_links(td: BeautifulSoup) -> List[str]:
 
 def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
     if name == "Updated":
-        pass
+        return "updated_at", parse_date_block(content)
+    elif name == "Release date":
+        return "released_at", parse_date_block(content)
+    elif name == "Published":
+        return "published_at", parse_date_block(content)
+    elif name == "Status":
+        return "status", parse_text_from_links(content)[0]
+    elif name == "Platforms":
+        return "platforms", parse_text_from_links(content)
+    elif name == "Publisher":
+        return "publisher", content.text.strip()
+    elif name == "Rating":
+        return None  # Read the AggregatedRating block instead!
+    elif name == "Author":
+        author, author_url = parse_links(content).popitem()
+        return "author", {"author": author, "author_url": author_url}
+    elif name == "Authors":
+        return "authors", parse_links(content)
+    elif name == "Genre":
+        return "genre", parse_links(content)
+    elif name == "Made with":
+        return "tools", parse_links(content)
+    elif name == "License":
+        return "license", parse_links(content)
+    elif name == "Asset license":
+        return "asset_license", parse_links(content)
+    elif name == "Tags":
+        return "tags", parse_links(content)
+    elif name == "Average session":
+        return "length", parse_text_from_links(content)[0]
+    elif name == "Languages":
+        return "languages", parse_links(content)
+    elif name == "Multiplayer":
+        return "multiplayer", parse_links(content)
+    elif name == "Player count":
+        return "player_count", content.text.strip()
+    elif name == "Accessibility":
+        return "accessibility", parse_links(content)
+    elif name == "Inputs":
+        return "inputs", parse_links(content)
+    elif name == "Links":
+        return "links", parse_links(content)
+    elif name == "Mentions":
+        return "mentions", parse_links(content)
+    else:
+        # Oops, you need to extend this with something new. Sorry.
+        # Make sure to add the block name to InfoboxMetadata as well!
+        raise NotImplementedError(f"Unknown infobox block name '{name}' - please file a new itch-dl issue.")
 
 
 def parse_infobox(infobox: BeautifulSoup) -> dict:
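And a sketch of `parse_tr` on an invented "Genre" cell, which routes through `parse_links` and returns a key plus a link map (note that the "Languages" branch emits a `languages` key that `InfoboxMetadata` above does not declare yet):

content = BeautifulSoup('<table><tr><td><a href="https://itch.io/games/genre-rpg">Role Playing</a></td></tr></table>',
                        features="lxml").find("td")
print(parse_tr("Genre", content))
# -> ('genre', {'Role Playing': 'https://itch.io/games/genre-rpg'})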
@@ -42,6 +115,7 @@ def parse_infobox(infobox: BeautifulSoup) -> dict:
         parsed_block = parse_tr(name, content_td)
         if parsed_block:
+            # noinspection PyTypedDict
             meta[parsed_block[0]] = parsed_block[1]
 
     return meta