From 4a8f88b48e2bf223152546369b9c254bdb4672a8 Mon Sep 17 00:00:00 2001
From: Ryszard Knop
Date: Sun, 15 May 2022 02:02:45 +0200
Subject: [PATCH] Trial The Third: Start rewriting the thing

Wooo, someone wants to use this! Let's make it less embarrassing.
---
 .idea/itch-dl.iml     |   2 +-
 LICENSE               |   2 +-
 README.md             |  45 +++---
 downloader.py         | 349 ------------------------------------------
 itch_dl/__init__.py   |   1 +
 itch_dl/__main__.py   |   3 +
 itch_dl/api.py        |  43 ++++++
 itch_dl/cli.py        |  67 ++++++++
 itch_dl/consts.py     |  29 ++++
 itch_dl/downloader.py | 251 ++++++++++++++++++++++++++++++
 itch_dl/handlers.py   | 218 ++++++++++++++++++++++++++
 itch_dl/keys.py       |  31 ++++
 pyproject.toml        |  14 +-
 13 files changed, 676 insertions(+), 379 deletions(-)
 delete mode 100755 downloader.py
 create mode 100644 itch_dl/__init__.py
 create mode 100644 itch_dl/__main__.py
 create mode 100644 itch_dl/api.py
 create mode 100644 itch_dl/cli.py
 create mode 100644 itch_dl/consts.py
 create mode 100644 itch_dl/downloader.py
 create mode 100644 itch_dl/handlers.py
 create mode 100644 itch_dl/keys.py

diff --git a/.idea/itch-dl.iml b/.idea/itch-dl.iml
index 6fb469e..1ce8f6f 100644
--- a/.idea/itch-dl.iml
+++ b/.idea/itch-dl.iml
@@ -4,7 +4,7 @@
-
+
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 9454ea4..0ebb4a8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2021 Dragoon Aethis
+Copyright (c) 2022 Dragoon Aethis
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 2a7e6c2..492d022 100644
--- a/README.md
+++ b/README.md
@@ -1,43 +1,42 @@
 # itch-dl
 
-Bulk download games from [itch.io](https://itch.io/). Currently only supports downloading game jams.
+Bulk download games from [itch.io](https://itch.io/).
 
-What you'll need:
-
-- Python 3.8+
-- `pip install -r requirements.txt`
-- For site mirroring, [wget](https://www.gnu.org/software/wget/) in your PATH.
-
-On Arch, `pacman -S wget python python-requests python-slugify` works.
+- Can download game jams, browse pages (popular, newest, browse by tag...) and individual games.
+- Requires Python 3.8+; grab it from PyPI: `pip install itch-dl`
+- For development, use [Poetry](https://python-poetry.org/).
+- Optionally requires wget for site mirroring.
 
 How to use this:
 
 - Log into itch.io with the account you'd like to use for downloading.
 - Generate a new API key on your user account page: https://itch.io/user/settings/api-keys
-- Run the downloader: `python downloader.py --api-key <key> https://itch.io/jam/yourjamhere`
+- Run the downloader: `itch-dl --api-key <key> https://itch.io/jam/yourjamhere`
 - Wait. This is going to take a while.
 
 The downloader is able to grab more or less everything you can download via the itch app.
 
-It's expected that the downloader output will not be complete - logs are stupidly verbose and
-it prints a report on successful/failed downloads, so you must manually grab whatever was not
-handled for you automatically for some reason.
+The input URL can be any "Browse" page (top, popular, newest, filtered by tags, etc.) or any
+game jam. The input can also be a path to an itch.io JSON file with game jam entries, or just
+a list of itch.io game URLs (not browse/jam pages!) to download.
 
-The downloader also grabs the entry page HTML, which usually comes with controls and such. It
-does not download images, external assets and so on, just the text - if the Itch page dies,
-so will most elements on those downloaded pages. Controls should survive, though.
+**It's expected that the downloader output will not be complete** - logs are stupidly verbose
+and it prints a report on successful/failed downloads, so you must manually grab whatever was
+not handled for you automatically for some reason.
 
-(There's a pedantic site mirroring toggle in the script, if you know what you're doing. You will
-need wget for that.)
+The downloader also grabs the entry page HTML, which usually comes with controls and such. By
+default, it does not download images, assets and so on, just the text - use `--mirror-web` to
+try to download these as well. This requires `wget` to be available in your `PATH`.
 
-## Cannot extract IDs?
+## Game Jam Entries JSON
 
-Downloader can parse and download games from a game jam entries JSON file if you want to provide it.
-(The script basically automates the steps below, so if it's not able to do the same, please create
-an issue!)
+The downloader can parse and download games from a game jam entries JSON file if you need it.
+(The script basically automates the steps below, so if it's not able to do the same, please
+create an issue!)
 
 - Go to your jam's page, ex. https://itch.io/jam/gbcompo21 and right-click -> View Source.
 - Ctrl-F for `"id":` - it should find that text once, followed by a number. Write it down.
-- (It you found it multiple times, grab the one after ViewJam something something.)
-- Download https://itch.io/jam/NUMBER/entries.json (replacing NUMBER with what you wrote down)
+- (If you found it multiple times, grab the one after I.ViewJam something something.)
+- Download https://itch.io/jam/ID/entries.json (replacing ID with what you wrote down).
+- Feed that to `itch-dl`!
diff --git a/downloader.py b/downloader.py
deleted file mode 100755
index 7b41acb..0000000
--- a/downloader.py
+++ /dev/null
@@ -1,349 +0,0 @@
-#!/usr/bin/env python3
-# Python 3.8+ and dependencies listed below required.
-import os
-import re
-import sys
-import json
-import time
-import shutil
-import hashlib
-import argparse
-import traceback
-import subprocess
-from enum import Enum
-
-import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry
-
-from slugify import slugify
-
-WGET_PATH = shutil.which("wget")
-if WGET_PATH is None:
-    print(f"Warning: wget not available, site mirroring will not work!")
-
-# Try to download all site assets, images etc included.
-# You probably don't want this, but here you go!
-PEDANTIC_MIRRORING = False
-
-ITCH_API = "https://api.itch.io"
-
-
-class ItchDownloadResult(Enum):
-    SUCCESS = 0
-    FAILURE = 1
-    MISSING_DOWNLOAD = 2
-    DOWNLOAD_TIMEOUT = 3
-
-
-class ItchDownloadError(Exception):
-    pass
-
-
-class ItchApiClient():
-    def __init__(self, base_url: str, api_key: str):
-        self.base_url = base_url
-        self.api_key = api_key
-
-        self.requests = requests.Session()
-
-        retry_strategy = Retry(
-            total=5,
-            backoff_factor=10,
-            allowed_methods=["HEAD", "GET"],
-            status_forcelist=[429, 500, 502, 503, 504]
-        )
-
-        # No timeouts - set them explicitly on API calls below!
-        adapter = HTTPAdapter(max_retries=retry_strategy)
-        self.requests.mount("https://", adapter)
-        self.requests.mount("http://", adapter)
-
-    def add_api_key(self, kwargs):
-        # Adds the API key to request params, if one was not
-        # already provided outside of the client.
- if 'data' in kwargs: - params = kwargs['data'] - else: - params = {} - kwargs['data'] = params - - if 'api_key' not in params: - params['api_key'] = self.api_key - - def get(self, endpoint: str, *args, **kwargs): - self.add_api_key(kwargs) - return self.requests.get(self.base_url + endpoint, *args, **kwargs) - - -def download_file(client: ItchApiClient, upload_id: int, download_path: str, creds: dict, print_url: bool=False): - # No timeouts, chunked uploads, default retry strategy, should be all good? - try: - with client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r: - r.raise_for_status() - if print_url: - print(f"Download URL: {r.url}") - - with open(download_path, 'wb') as f: - for chunk in r.iter_content(chunk_size=1048576): # 1MB chunks - f.write(chunk) - except requests.exceptions.HTTPError as e: - raise ItchDownloadError(f"Unrecoverable download error: {e}") - - -def get_download_keys(client: ItchApiClient): - print("Fetching all download keys...") - download_keys = {} - page = 1 - - while True: - print(f"Downloading page {page}...") - try: - r = client.get("/profile/owned-keys", data={"page": page}, timeout=15) - r.raise_for_status() - except Exception as e: - print(f"Got error while fetching download keys: {e}") - print(f"Let's just pretend this is enough and move on...") - break - - data = r.json() - if 'owned_keys' not in data: - break # Assuming we're out of keys already... - - for key in data['owned_keys']: - download_keys[key['game_id']] = key['id'] - - if len(data['owned_keys']) == data['per_page']: - page += 1 - else: - break - - print(f"Fetched {len(download_keys)} download keys.") - return download_keys - - -def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]: - if 'jam_games' not in jam_json: - raise Exception("Provided JSON is not a valid itch.io jam JSON.") - - # Extract (id, url) pairs from all the entries. - return [(int(e['game']['id']), e['game']['title'], e['game']['url']) for e in jam_json['jam_games']] - - -def get_game_jam_json(jam_path: str) -> dict: - # Do we have an URL? - jam_path = jam_path.strip() - if jam_path.startswith("https://") or jam_path.startswith("http://"): - r = requests.get(jam_path) - if not r.ok: - raise Exception(f"Could not download game jam site from {jam_path} (code {r.status_code}): {r.reason}") - - jam_id_line = None - for line in r.text.splitlines(): - if "ViewJam" in line: - jam_id_line = line - - if jam_id_line is None: - raise Exception(f"Jam site did not contain the ID line - please provide the path to the game jam entries JSON file instead.") - - found_ids = re.findall(r'\"id\":([0-9]+)', jam_id_line) - if len(found_ids) == 0: - raise Exception(f"Could not extract the jam ID from the provided site.") - - jam_id = int(found_ids[0]) # Always grab the first one for now... 
- print(f"Extracted jam ID: {jam_id}") - - r = requests.get(f"https://itch.io/jam/{jam_id}/entries.json") - if not r.ok: - raise Exception(f"Could not download the game jam entries list.") - - content = r.text - elif os.path.isfile(jam_path): - try: - with open(jam_path) as f: - content = f.read() - except Exception as e: - raise Exception(f"Could not open/read the game jam entries file: {e}") - else: - raise Exception(f"Provided game jam path is invalid (not a link/existing file).") - - try: - jam_json = json.loads(content) - except json.decoder.JSONDecodeError: - print(f"Provided game jam entries file is not a valid JSON file.") - - return jam_json - - -def download_jam(jam_path: str, download_to: str, api_key: str, continue_from: str=None): - client = ItchApiClient(ITCH_API, api_key) - jam_json = get_game_jam_json(jam_path) - - # Check API key validity: - profile_req = client.get("/profile") - if not profile_req.ok: - print(f"Provided API key appears to be invalid: {profile_req.text}") - exit(1) - - jobs = parse_jobs(jam_json) - jobs_successful = [] - jobs_failed = [] - - download_keys = get_download_keys(client) - game_id_to_meta = {} # dict[game_id: int, (title: str, url: str)] - - for game_id, title, url in jobs: - game_id_to_meta[game_id] = (title, url) - - failed_game_ids = set() - - # No "continue from"? Yep, start right away. - should_process_jobs = continue_from is None - - for game_id, title, url in jobs: - label = f"{title} ({game_id})" - if not should_process_jobs: - if game_id == continue_from: - should_process_jobs = True - else: - continue - - try: - download_path = os.path.join(download_to, slugify(title)) - if PEDANTIC_MIRRORING: - site_mirror_path = os.path.join(download_to, "_sites") - else: - site_mirror_path = os.path.join(download_path, "site") - os.makedirs(download_path, exist_ok=True) - os.makedirs(site_mirror_path, exist_ok=True) - except: - raise ItchDownloadError(f"Could not create download directory: {download_path}") - - print(f"Trying to download {label} to {download_path}") - - if WGET_PATH is not None: - print("Downloading site...") - if PEDANTIC_MIRRORING: - extra_wget_args = [ - "--timestamping", - "--span-hosts", - "--convert-links", - "--adjust-extension", - "--page-requisites", - ] - else: - extra_wget_args = [] - - wget = subprocess.run([ - WGET_PATH, - *extra_wget_args, - "--quiet", - url - ], cwd=site_mirror_path) - - if wget.returncode != 0: - print(f"Warning: Site mirroring failed/incomplete.") - - creds = {} - if game_id in download_keys: - creds['download_key_id'] = download_keys[game_id] - print("Using {creds} for private uploads") - - game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15) - if not game_uploads_req.ok: - raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}") - - game_uploads = game_uploads_req.json()['uploads'] - print(f"Found {len(game_uploads)} upload(s)") - - try: - for upload in game_uploads: - upload_id = upload['id'] - file_name = upload['filename'] - file_size = upload['size'] - upload_is_external = upload['storage'] == 'external' - - print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...") - if upload_is_external: - print("***********************************************************") - print("* *") - print("* WARNING: External storage - downloads will likely fail. *") - print("* Check the URL displayed below manually! 
*") - print("* *") - print("***********************************************************") - - target_path = os.path.join(download_path, file_name) - try: - download_file(client, upload_id, target_path, creds, print_url=upload_is_external) - except ItchDownloadError as e: - jobs_failed.append((game_id, file_name, str(e))) - print(f"Download failed for {file_name}: {e}") - continue - - try: - actual_file_size = os.stat(target_path).st_size - if actual_file_size == file_size: - jobs_successful.append((game_id, file_name)) - else: - jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}")) - except FileNotFoundError: - jobs_failed.append((game_id, file_name, "Could not download file")) - - print(f"Done downloading {label}") - except ItchDownloadError as e: - failed_game_ids.append((game_id, str(e))) - print(message) - continue - except Exception as e: - print(f"Critical error while downloading {label}: {e}") - failed_game_ids.append((game_id, str(e))) - traceback.print_exc() - print(message) - continue - - successful_titles = {} - for game_id, file_name in jobs_successful: - if game_id not in successful_titles: - successful_titles[game_id] = [file_name] - - if any(successful_titles): - print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):") - for game_id, files in successful_titles.items(): - print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)") - - if any(jobs_failed): - print(f"\nDownloads failed for {len(jobs_failed)} file(s):") - for game_id, file_name, message in jobs_failed: - title, url = game_id_to_meta[game_id] - print(f"{title} - {file_name} - {message}") - print(f"Title URL: {url}") - - if any(failed_game_ids): - print(f"\nCompletely failed downloads for {len(failed_game_ids)} titles:") - for game_id, message in failed_game_ids: - title, url = game_id_to_meta[game_id] - print(f"{title} ({game_id}) - {url} - {message}") - - -def get_parser(): - parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.") - parser.add_argument("entries", help="path to the game jam entries.json file") - parser.add_argument("--api-key", metavar="key", required=True, help="itch.io API key from https://itch.io/user/settings/api-keys") - parser.add_argument("--download-to", metavar="path", help="directory to save results into (default: current dir)") - parser.add_argument("--continue-from", metavar="id", type=int, help="skip all entries until the provided entry ID is found") - return parser - - -def get_download_dir(args: argparse.Namespace) -> str: - download_to = os.getcwd() - if args.download_to is not None: - download_to = os.path.normpath(args.download_to) - os.makedirs(download_to) - - return download_to - - -if __name__ == "__main__": - args = get_parser().parse_args() - download_to = get_download_dir(args) - download_jam(args.entries, download_to, args.api_key, continue_from=args.continue_from) diff --git a/itch_dl/__init__.py b/itch_dl/__init__.py new file mode 100644 index 0000000..b794fd4 --- /dev/null +++ b/itch_dl/__init__.py @@ -0,0 +1 @@ +__version__ = '0.1.0' diff --git a/itch_dl/__main__.py b/itch_dl/__main__.py new file mode 100644 index 0000000..e0f1582 --- /dev/null +++ b/itch_dl/__main__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python3 +from itch_dl.cli import run +run() diff --git a/itch_dl/api.py b/itch_dl/api.py new file mode 100644 index 0000000..21be0f2 --- /dev/null +++ b/itch_dl/api.py @@ -0,0 +1,43 @@ +from typing import Optional + +from requests import Session +from 
urllib3.util.retry import Retry +from requests.adapters import HTTPAdapter + +from .consts import ITCH_API + + +class ItchApiClient: + def __init__(self, api_key: str, base_url: Optional[str] = None): + self.base_url = base_url or ITCH_API + self.api_key = api_key + + self.requests = Session() + + retry_strategy = Retry( + total=5, + backoff_factor=10, + allowed_methods=["HEAD", "GET"], + status_forcelist=[429, 500, 502, 503, 504] + ) + + # No timeouts - set them explicitly on API calls below! + adapter = HTTPAdapter(max_retries=retry_strategy) + self.requests.mount("https://", adapter) + self.requests.mount("http://", adapter) + + def get(self, endpoint: str, append_api_key: bool = True, **kwargs): + if append_api_key: + params = kwargs.get('data') or {} + + if 'api_key' not in params: + params['api_key'] = self.api_key + + kwargs['data'] = params + + if endpoint.startswith("https://"): + url = endpoint + else: + url = self.base_url + endpoint + + return self.requests.get(url, **kwargs) diff --git a/itch_dl/cli.py b/itch_dl/cli.py new file mode 100644 index 0000000..8a1b1ef --- /dev/null +++ b/itch_dl/cli.py @@ -0,0 +1,67 @@ +import os +import logging +import argparse + +from .handlers import get_jobs_for_url_or_path +from .downloader import drive_downloads +from .keys import get_download_keys +from .api import ItchApiClient +logging.basicConfig() +logging.getLogger().setLevel(logging.INFO) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Bulk download stuff from Itch.io.") + parser.add_argument("url_or_path", + help="itch.io URL or path to a game jam entries.json file") + parser.add_argument("--api-key", metavar="key", required=True, + help="itch.io API key - https://itch.io/user/settings/api-keys") + parser.add_argument("--urls-only", action="store_true", + help="print scraped game URLs without downloading them") + parser.add_argument("--download-to", metavar="path", + help="directory to save results into (default: current dir)") + parser.add_argument("--parallel", metavar="parallel", type=int, default=1, + help="how many threads to use for downloading games (default: 1)") + parser.add_argument("--mirror-web", action="store_true", + help="try to fetch assets on game sites") + parser.add_argument("--verbose", action="store_true", + help="print verbose logs") + return parser.parse_args() + + +def run() -> int: + args = parse_args() + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + jobs = get_jobs_for_url_or_path(args.url_or_path, args.api_key) + jobs = list(set(jobs)) # Deduplicate, just in case... 
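+    # Note: set() deduplication does not preserve the original scrape order.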
+    logging.info(f"Found {len(jobs)} URL(s).")
+
+    if len(jobs) == 0:
+        print("No URLs to download.")
+        return 1
+
+    if args.urls_only:
+        for job in jobs:
+            print(job)
+
+        return 0
+
+    download_to = os.getcwd()
+    if args.download_to is not None:
+        download_to = os.path.normpath(args.download_to)
+        os.makedirs(download_to, exist_ok=True)
+
+    client = ItchApiClient(args.api_key)
+
+    # Check API key validity:
+    profile_req = client.get("/profile")
+    if not profile_req.ok:
+        print(f"Provided API key appears to be invalid: {profile_req.text}")
+        return 1
+
+    # Grab all the download keys (there's no way to fetch them per title...):
+    keys = get_download_keys(client)
+
+    return drive_downloads(jobs, download_to, args.api_key, keys, parallel=args.parallel)
diff --git a/itch_dl/consts.py b/itch_dl/consts.py
new file mode 100644
index 0000000..83d8ac8
--- /dev/null
+++ b/itch_dl/consts.py
@@ -0,0 +1,29 @@
+from enum import Enum
+
+ITCH_BASE = "itch.io"
+ITCH_URL = f"https://{ITCH_BASE}"
+ITCH_API = f"https://api.{ITCH_BASE}"
+
+ITCH_BROWSER_TYPES = [
+    "games",
+    "tools",
+    "game-assets",
+    "comics",
+    "books",
+    "physical-games",
+    "soundtracks",
+    "game-mods",
+    "misc",
+]
+
+
+class ItchDownloadResult(Enum):
+    SUCCESS = 0
+    FAILURE = 1
+    MISSING_DOWNLOAD = 2
+    DOWNLOAD_TIMEOUT = 3
+
+
+# I mean, not really a const but eh
+class ItchDownloadError(Exception):
+    pass
diff --git a/itch_dl/downloader.py b/itch_dl/downloader.py
new file mode 100644
index 0000000..79e5c95
--- /dev/null
+++ b/itch_dl/downloader.py
@@ -0,0 +1,251 @@
+import os
+import shutil
+import logging
+import traceback
+import subprocess
+from typing import Tuple, List, Dict, TypedDict, Optional
+
+import requests
+from slugify import slugify
+from requests.exceptions import HTTPError
+
+from tqdm import tqdm
+from tqdm.contrib.concurrent import thread_map
+
+from .api import ItchApiClient
+from .consts import ItchDownloadError, ItchDownloadResult
+
+
+# ------------------------------
+# --- OLD STUFF --- CUT HERE ---
+# ------------------------------
+
+
+WGET_PATH = shutil.which("wget")
+if WGET_PATH is None:
+    print("Warning: wget not available, site mirroring will not work!")
+
+
+def download_file(client: ItchApiClient, upload_id: int, download_path: str, creds: dict, print_url: bool=False):
+    # No timeouts, chunked uploads, default retry strategy, should be all good?
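+    # Stream the response to disk in 1 MB chunks instead of buffering whole
+    # files in memory - some uploads are far too large for that.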
+    try:
+        with client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r:
+            r.raise_for_status()
+            if print_url:
+                print(f"Download URL: {r.url}")
+
+            with open(download_path, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=1048576):  # 1MB chunks
+                    f.write(chunk)
+    except HTTPError as e:
+        raise ItchDownloadError(f"Unrecoverable download error: {e}")
+
+
+def get_meta_for_game_url(game_url: str) -> int:
+    """Finds the Game ID for a Game URL."""
+    data_url = game_url.rstrip("/") + "/data.json"
+    data_req = requests.get(data_url)
+    data_req.raise_for_status()
+
+    data_json = data_req.json()
+    if 'id' not in data_json:
+        raise ItchDownloadError(f"Cannot fetch the Game ID for URL: {game_url}")
+
+    return data_json['id']
+
+
+def download_jam(jam_path: str, download_to: str, api_key: str, continue_from: str=None):
+    client = ItchApiClient(api_key)
+    jam_json = get_game_jam_json(jam_path)
+
+    # Check API key validity:
+    profile_req = client.get("/profile")
+    if not profile_req.ok:
+        print(f"Provided API key appears to be invalid: {profile_req.text}")
+        exit(1)
+
+    jobs = parse_jobs(jam_json)
+    jobs_successful = []
+    jobs_failed = []
+
+    game_id_to_meta = {}  # dict[game_id: int, (title: str, url: str)]
+
+    for game_id, title, url in jobs:
+        game_id_to_meta[game_id] = (title, url)
+
+    failed_game_ids = []
+
+    # No "continue from"? Yep, start right away.
+    should_process_jobs = continue_from is None
+
+    for game_id, title, url in jobs:
+        label = f"{title} ({game_id})"
+        if not should_process_jobs:
+            if game_id == continue_from:
+                should_process_jobs = True
+            else:
+                continue
+
+        try:
+            download_path = os.path.join(download_to, slugify(title))
+            if PEDANTIC_MIRRORING:
+                site_mirror_path = os.path.join(download_to, "_sites")
+            else:
+                site_mirror_path = os.path.join(download_path, "site")
+            os.makedirs(download_path, exist_ok=True)
+            os.makedirs(site_mirror_path, exist_ok=True)
+        except OSError:
+            raise ItchDownloadError(f"Could not create download directory: {download_path}")
+
+        print(f"Trying to download {label} to {download_path}")
+
+        if WGET_PATH is not None:
+            print("Downloading site...")
+            if PEDANTIC_MIRRORING:
+                extra_wget_args = [
+                    "--timestamping",
+                    "--span-hosts",
+                    "--convert-links",
+                    "--adjust-extension",
+                    "--page-requisites",
+                ]
+            else:
+                extra_wget_args = []
+
+            wget = subprocess.run([
+                WGET_PATH,
+                *extra_wget_args,
+                "--quiet",
+                url
+            ], cwd=site_mirror_path)
+
+            if wget.returncode != 0:
+                print("Warning: Site mirroring failed/incomplete.")
+
+        creds = {}
+        if game_id in self.download_keys:
+            creds['download_key_id'] = self.download_keys[game_id]
+            print(f"Using {creds} for private uploads")
+
+        game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
+        if not game_uploads_req.ok:
+            raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")
+
+        game_uploads = game_uploads_req.json()['uploads']
+        print(f"Found {len(game_uploads)} upload(s)")
+
+        try:
+            for upload in game_uploads:
+                upload_id = upload['id']
+                file_name = upload['filename']
+                file_size = upload['size']
+                upload_is_external = upload['storage'] == 'external'
+
+                print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
+                if upload_is_external:
+                    print("***********************************************************")
+                    print("*                                                         *")
+                    print("* WARNING: External storage - downloads will likely fail. *")
+                    print("* Check the URL displayed below manually!                 *")
+                    print("*                                                         *")
+                    print("***********************************************************")
+
+                target_path = os.path.join(download_path, file_name)
+                try:
+                    download_file(client, upload_id, target_path, creds, print_url=upload_is_external)
+                except ItchDownloadError as e:
+                    jobs_failed.append((game_id, file_name, str(e)))
+                    print(f"Download failed for {file_name}: {e}")
+                    continue
+
+                try:
+                    actual_file_size = os.stat(target_path).st_size
+                    if actual_file_size == file_size:
+                        jobs_successful.append((game_id, file_name))
+                    else:
+                        jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
+                except FileNotFoundError:
+                    jobs_failed.append((game_id, file_name, "Could not download file"))
+
+            print(f"Done downloading {label}")
+        except ItchDownloadError as e:
+            failed_game_ids.append((game_id, str(e)))
+            print(str(e))
+            continue
+        except Exception as e:
+            print(f"Critical error while downloading {label}: {e}")
+            failed_game_ids.append((game_id, str(e)))
+            traceback.print_exc()
+            continue
+
+    successful_titles = {}
+    for game_id, file_name in jobs_successful:
+        if game_id not in successful_titles:
+            successful_titles[game_id] = [file_name]
+        else:
+            successful_titles[game_id].append(file_name)
+
+    if any(successful_titles):
+        print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
+        for game_id, files in successful_titles.items():
+            print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")
+
+    if any(jobs_failed):
+        print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
+        for game_id, file_name, message in jobs_failed:
+            title, url = game_id_to_meta[game_id]
+            print(f"{title} - {file_name} - {message}")
+            print(f"Title URL: {url}")
+
+    if any(failed_game_ids):
+        print(f"\nCompletely failed downloads for {len(failed_game_ids)} titles:")
+        for game_id, message in failed_game_ids:
+            title, url = game_id_to_meta[game_id]
+            print(f"{title} ({game_id}) - {url} - {message}")
+
+
+# ------------------------------
+# --- OLD STUFF --- CUT HERE ---
+# ------------------------------
+
+
+class GameAuthor(TypedDict, total=False):
+    name: str
+    url: str
+
+
+class GameMetadata(TypedDict, total=False):
+    description: str
+
+
+class GameDownloadJob(TypedDict, total=False):
+    url: str
+    game_id: int
+    title: str
+    author: GameAuthor
+    metadata: GameMetadata
+
+
+class GameDownloader:
+    def __init__(self, download_to: str, api_key: str, keys: Dict[int, str]):
+        self.download_to = download_to
+        self.download_keys = keys
+
+        self.client = ItchApiClient(api_key)
+
+    def download(self, url: str):
+        job = GameDownloadJob(url=url)
+        raise NotImplementedError("Not yet!")
+
+
+def drive_downloads(jobs: List[str], download_to: str, api_key: str, keys: Dict[int, str], parallel: int = 1):
+    downloader = GameDownloader(download_to, api_key, keys)
+
+    if parallel > 1:
+        thread_map(downloader.download, jobs, max_workers=parallel)
+    else:
+        for job in tqdm(jobs):
+            downloader.download(job)
diff --git a/itch_dl/handlers.py b/itch_dl/handlers.py
new file mode 100644
index 0000000..184db57
--- /dev/null
+++ b/itch_dl/handlers.py
@@ -0,0 +1,218 @@
+import re
+import json
+import os.path
+import logging
+import urllib.parse
+from typing import List, Optional
+
+from bs4 import BeautifulSoup
+
+from .api import ItchApiClient
+from .consts import ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES, ItchDownloadError
+
+
+def get_jobs_for_game_jam_json(game_jam_json: dict) -> List[str]:
+    if 'jam_games' not in game_jam_json:
+        raise ItchDownloadError("Provided JSON is not a valid itch.io jam JSON.")
+
+    return [g['game']['url'] for g in game_jam_json['jam_games']]
+
+
+def get_int_after_marker_in_json(text: str, marker: str, key: str) -> Optional[int]:
+    """
+    Many itch.io sites use a pattern like this: Most of the HTML page
+    is prerendered, but certain interactive objects are handled with
+    JavaScript initialized with `I.WidgetHandler({"id": 123, ...})`
+    somewhere near the end of each page. Those config blocks often
+    contain metadata like game/page IDs that we want to extract.
+    """
+    marker_line: Optional[str] = None
+    for line in reversed(text.splitlines()):
+        marker_index = line.find(marker)
+        if marker_index != -1:
+            marker_line = line[marker_index:]
+            break
+
+    if marker_line is None:
+        return None
+
+    # Note the doubled backslashes - this is an f-string, not an r-string!
+    pattern = f'\\"{key}\\":\\s?(\\d+)'
+
+    found_ints = re.findall(pattern, marker_line)
+    if len(found_ints) != 1:
+        return None
+
+    return int(found_ints[0])
+
+
+def get_game_jam_json(jam_url: str, client: ItchApiClient) -> dict:
+    r = client.get(jam_url)
+    if not r.ok:
+        raise ItchDownloadError(f"Could not download the game jam site: {r.status_code} {r.reason}")
+
+    jam_id: Optional[int] = get_int_after_marker_in_json(r.text, "I.ViewJam", "id")
+    if jam_id is None:
+        raise ItchDownloadError("Provided site did not contain the Game Jam ID. Provide "
+                                "the path to the game jam entries JSON file instead, or "
+                                "create an itch-dl issue with the Game Jam URL.")
+
+    logging.info(f"Extracted Game Jam ID: {jam_id}")
+    r = client.get(f"{ITCH_URL}/jam/{jam_id}/entries.json")
+    if not r.ok:
+        raise ItchDownloadError(f"Could not download the game jam entries list: {r.status_code} {r.reason}")
+
+    return r.json()
+
+
+def get_jobs_for_browse_url(url: str, client: ItchApiClient) -> List[str]:
+    """
+    Every browser page has a hidden RSS feed that can be accessed by
+    appending .xml to its URL. An optional "page" argument lets us
+    iterate over their contents. When no more elements are available,
+    the last returned feed has no items.
+
+    The input URL is cleaned in the main URL handler, so append the
+    .xml?page=N suffix and iterate until we've caught 'em all.
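+
+    For example, page 2 of a tag listing would be fetched from a URL shaped
+    like this (hypothetical tag): https://itch.io/games/tag-puzzle.xml?page=2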
+ """ + page = 1 + found_urls = set() + logging.info(f"Scraping game URLs from RSS feeds for %s", url) + + while True: + logging.info(f"Downloading page {page} (found {len(found_urls)} URLs total)") + r = client.get(f"{url}.xml?page={page}", append_api_key=False) + if not r.ok: + logging.info("RSS feed returned %s, finished.", r.reason) + break + + soup = BeautifulSoup(r.text, features="xml") + rss_items = soup.find_all("item") + if len(rss_items) < 1: + logging.info("No more items, finished.") + break + + logging.info(f"Found {len(rss_items)} items.") + for item in rss_items: + link_node = item.find("link") + if link_node is None: + continue + + node_url = link_node.text.strip() + if len(node_url) > 0: + found_urls.add(node_url) + + page += 1 + + if len(found_urls) == 0: + raise ItchDownloadError("No game URLs found to download.") + + return list(found_urls) + + +def get_jobs_for_itch_url(url: str, client: ItchApiClient) -> List[str]: + if url.startswith("http://"): + logging.info("HTTP link provided, upgrading to HTTPS") + url = "https://" + url[7:] + + if url.startswith(f"https://www.{ITCH_BASE}/"): + logging.info(f"Correcting www.{ITCH_BASE} to {ITCH_BASE}") + url = ITCH_URL + '/' + url[20:] + + url_parts = urllib.parse.urlparse(url) + url_path_parts: List[str] = [x for x in str(url_parts.path).split('/') if len(x) > 0] + + if url_parts.netloc == ITCH_BASE: + if len(url_path_parts) == 0: + raise NotImplementedError("itch-dl cannot download the entirety of itch.io.") + # (yet) (also leafo would not be happy with the bandwidth bill) + + site = url_path_parts[0] + + if site == "jam": # Game jams + if len(url_path_parts) < 2: + raise ValueError(f"Incomplete game jam URL: {url}") + + logging.info("Fetching Game Jam JSON...") + clean_game_jam_url = f"{ITCH_URL}/jam/{url_path_parts[1]}" + game_jam_json = get_game_jam_json(clean_game_jam_url, client) + return get_jobs_for_game_jam_json(game_jam_json) + + elif site in ITCH_BROWSER_TYPES: # Browser + clean_browse_url = '/'.join([ITCH_URL, *url_path_parts]) + return get_jobs_for_browse_url(clean_browse_url, client) + + elif site in ("b", "bundle"): # Bundles + raise NotImplementedError("itch-dl cannot download bundles yet.") + + elif site in ("j", "jobs"): # Jobs... + raise ValueError("itch-dl cannot download a job.") + + elif site in ("t", "board", "community"): # Forums + raise ValueError("itch-dl cannot download forums.") + + elif site == "profile": # Forum Profile + if len(url_path_parts) >= 2: + username = url_path_parts[1] + logging.info("Correcting user profile to creator page for %s", username) + return get_jobs_for_itch_url(f"https://{username}.{ITCH_BASE}", client) + + raise ValueError("itch-dl expects a username in profile links.") + + # Something else? + raise NotImplementedError(f"itch-dl does not understand \"{site}\" URLs. Please file a new issue.") + + elif url_parts.netloc.endswith(f".{ITCH_BASE}"): + if len(url_path_parts) == 0: # Author + # TODO: Find I.UserPage, regex for "user_id": [0-9]+, find the responsible API? + raise NotImplementedError("itch-dl cannot download author pages yet.") + + else: # Single game + # Just clean and return the URL: + return [f"https://{url_parts.netloc}/{url_path_parts[0]}"] + + else: + raise ValueError(f"Unknown domain: {url_parts.netloc}") + + +def get_jobs_for_path(path: str) -> List[str]: + try: # Game Jam Entries JSON? 
+        with open(path) as f:
+            json_data = json.load(f)
+
+        if not isinstance(json_data, dict):
+            raise ValueError(f"File does not contain a JSON dict: {path}")
+
+        if 'jam_games' in json_data:
+            logging.info("Parsing provided file as a Game Jam Entries JSON...")
+            return get_jobs_for_game_jam_json(json_data)
+    except json.JSONDecodeError:
+        pass  # Not a valid JSON, okay...
+
+    url_list = []
+    with open(path) as f:  # Plain job list?
+        for line in f:
+            line = line.strip()
+            if line.startswith("https://") or line.startswith("http://"):
+                url_list.append(line)
+
+    if len(url_list) > 0:
+        logging.info("Parsing provided file as a list of URLs to fetch...")
+        return url_list
+
+    raise ValueError("File format is unknown - cannot read URLs to download.")
+
+
+def get_jobs_for_url_or_path(path_or_url: str, api_key: str) -> List[str]:
+    """Returns a list of Game URLs for a given itch.io URL or file."""
+    path_or_url = path_or_url.strip()
+
+    if path_or_url.startswith("http://"):
+        logging.info("HTTP link provided, upgrading to HTTPS")
+        path_or_url = "https://" + path_or_url[7:]
+
+    if path_or_url.startswith("https://"):
+        client = ItchApiClient(api_key)
+        return get_jobs_for_itch_url(path_or_url, client)
+    elif os.path.isfile(path_or_url):
+        return get_jobs_for_path(path_or_url)
+    else:
+        raise ValueError(f"Provided input is neither a valid URL nor a file: {path_or_url}")
diff --git a/itch_dl/keys.py b/itch_dl/keys.py
new file mode 100644
index 0000000..ed09694
--- /dev/null
+++ b/itch_dl/keys.py
@@ -0,0 +1,31 @@
+import logging
+from typing import Dict
+
+from .api import ItchApiClient
+
+
+def get_download_keys(client: ItchApiClient) -> Dict[int, str]:
+    logging.info("Fetching all download keys...")
+    download_keys = {}
+    page = 1
+
+    while True:
+        logging.info(f"Downloading page {page} (found {len(download_keys)} keys total)")
+        r = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
+        if not r.ok:
+            break
+
+        data = r.json()
+        if 'owned_keys' not in data:
+            break  # Assuming we're out of keys already...
+
+        for key in data['owned_keys']:
+            download_keys[key['game_id']] = key['id']
+
+        if len(data['owned_keys']) == data['per_page']:
+            page += 1
+        else:
+            break
+
+    logging.info(f"Fetched {len(download_keys)} download keys.")
+    return download_keys
diff --git a/pyproject.toml b/pyproject.toml
index 1b48b94..2f33141 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "itch-dl"
-packages = [{ include = "itchdl" }]
+packages = [{ include = "itch_dl" }]
 version = "0.1.0"
 description = "itch.io bulk game downloader"
 homepage = "https://github.com/DragoonAethis/itch-dl"
@@ -24,11 +24,15 @@ classifiers = [
 
 [tool.poetry.dependencies]
 python = "^3.8"
-requests = "^2.26.0"
-python-slugify = "^5.0.0"
+tqdm = "^4.64.0"
+urllib3 = "^1.26.9"
+requests = "^2.27.1"
+python-slugify = "^6.1.2"
+beautifulsoup4 = "^4.11.1"
+lxml = "^4.8.0"
 
-[tool.poetry.dev-dependencies]
-pytest = "^6.2"
+[tool.poetry.scripts]
+itch-dl = "itch_dl.cli:run"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
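As a quick sanity check of the new layout, the whole pipeline can also be driven
programmatically, mirroring what `itch_dl/cli.py` does. A minimal sketch, assuming this
commit's module split and a placeholder API key; note that `GameDownloader.download` is
still a stub here, so the final call raises `NotImplementedError` until it is implemented:

```python
import os

from itch_dl.api import ItchApiClient
from itch_dl.keys import get_download_keys
from itch_dl.handlers import get_jobs_for_url_or_path
from itch_dl.downloader import drive_downloads

API_KEY = "your-api-key-here"  # Placeholder: https://itch.io/user/settings/api-keys

# Expand a jam/browse URL (or an entries.json/URL list file) into game URLs:
jobs = get_jobs_for_url_or_path("https://itch.io/jam/gbcompo21", API_KEY)

# Download keys unlock claimed/purchased uploads and can only be fetched in bulk:
client = ItchApiClient(API_KEY)
keys = get_download_keys(client)

# Fan the downloads out over two worker threads:
drive_downloads(jobs, os.getcwd(), API_KEY, keys, parallel=2)
```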