diff --git a/.idea/itch-dl.iml b/.idea/itch-dl.iml
index 6fb469e..1ce8f6f 100644
--- a/.idea/itch-dl.iml
+++ b/.idea/itch-dl.iml
@@ -4,7 +4,7 @@
-
+
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 9454ea4..0ebb4a8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2021 Dragoon Aethis
+Copyright (c) 2022 Dragoon Aethis
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 2a7e6c2..492d022 100644
--- a/README.md
+++ b/README.md
@@ -1,43 +1,42 @@
# itch-dl
-Bulk download games from [itch.io](https://itch.io/). Currently only supports downloading game jams.
+Bulk download games from [itch.io](https://itch.io/).
-What you'll need:
-
-- Python 3.8+
-- `pip install -r requirements.txt`
-- For site mirroring, [wget](https://www.gnu.org/software/wget/) in your PATH.
-
-On Arch, `pacman -S wget python python-requests python-slugify` works.
+- Can download game jams, browse pages (popular, newest, by tag...) and individual games.
+- Requires Python 3.8+. Install the tool from PyPI: `pip install itch-dl`
+- For development, use [Poetry](https://python-poetry.org/).
+- Optionally requires wget for site mirroring.
How to use this:
- Log into itch.io with the account you'd like to use for downloading.
- Generate a new API key on your user account page: https://itch.io/user/settings/api-keys
-- Run the downloader: `python downloader.py --api-key https://itch.io/jam/yourjamhere`
+- Run the downloader: `itch-dl --api-key https://itch.io/jam/yourjamhere`
- Wait. This is going to take a while.
The downloader is able to grab more or less everything you can download via the itch app.
-It's expected that the downloader output will not be complete - logs are stupidly verbose and
-it prints a report on successful/failed downloads, so you must manually grab whatever was not
-handled for you automatically for some reason.
+The input URL can be any "Browse" page (top, popular, newest, filtered by tags, etc.) or any
+game jam. The input can also be a path to an itch.io JSON file with game jam entries, or just
+a list of itch.io game URLs (not browse/jam pages!) to download.
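+
+For example, all of these work: `https://itch.io/jam/gbcompo21`,
+`https://itch.io/games/tag-horror`, or a text file with one game URL per line.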
-The downloader also grabs the entry page HTML, which usually comes with controls and such. It
-does not download images, external assets and so on, just the text - if the Itch page dies,
-so will most elements on those downloaded pages. Controls should survive, though.
+**It's expected that the downloader output will not be complete** - logs are stupidly verbose
+and it prints a report on successful/failed downloads, so you must manually grab whatever was
+not handled for you automatically for some reason.
-(There's a pedantic site mirroring toggle in the script, if you know what you're doing. You will
-need wget for that.)
+The downloader also grabs the entry page HTML, which usually comes with controls and such. By
+default, it does not download images, assets and so on, just the text - use `--mirror-web` to
+try to download these as well. This requires `wget` to be available in your `PATH`.
-## Cannot extract IDs?
+## Game Jam Entries JSON
-Downloader can parse and download games from a game jam entries JSON file if you want to provide it.
-(The script basically automates the steps below, so if it's not able to do the same, please create
-an issue!)
+The downloader can parse and download games from a game jam entries JSON file if you provide one.
+(The script basically automates the steps below, so if it's not able to do the same, please
+create an issue!)
- Go to your jam's page, ex. https://itch.io/jam/gbcompo21 and right-click -> View Source.
- Ctrl-F for `"id":` - it should find that text once, followed by a number. Write it down.
-- (It you found it multiple times, grab the one after ViewJam something something.)
-- Download https://itch.io/jam/NUMBER/entries.json (replacing NUMBER with what you wrote down)
+- (If you found it multiple times, grab the one right after I.ViewJam.)
+- Download https://itch.io/jam/ID/entries.json (replacing ID with what you wrote down).
+- Feed that to `itch-dl`!
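+
+If you'd rather script those last two steps, here's a rough sketch in Python (the
+jam ID below is made up - substitute the one you found):
+
+```python
+import requests  # the same HTTP library itch-dl uses internally
+
+jam_id = 123456  # hypothetical ID
+r = requests.get(f"https://itch.io/jam/{jam_id}/entries.json")
+r.raise_for_status()
+print(f"Found {len(r.json()['jam_games'])} entries.")
+```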
diff --git a/downloader.py b/downloader.py
deleted file mode 100755
index 7b41acb..0000000
--- a/downloader.py
+++ /dev/null
@@ -1,349 +0,0 @@
-#!/usr/bin/env python3
-# Python 3.8+ and dependencies listed below required.
-import os
-import re
-import sys
-import json
-import time
-import shutil
-import hashlib
-import argparse
-import traceback
-import subprocess
-from enum import Enum
-
-import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry
-
-from slugify import slugify
-
-WGET_PATH = shutil.which("wget")
-if WGET_PATH is None:
- print(f"Warning: wget not available, site mirroring will not work!")
-
-# Try to download all site assets, images etc included.
-# You probably don't want this, but here you go!
-PEDANTIC_MIRRORING = False
-
-ITCH_API = "https://api.itch.io"
-
-
-class ItchDownloadResult(Enum):
- SUCCESS = 0
- FAILURE = 1
- MISSING_DOWNLOAD = 2
- DOWNLOAD_TIMEOUT = 3
-
-
-class ItchDownloadError(Exception):
- pass
-
-
-class ItchApiClient():
- def __init__(self, base_url: str, api_key: str):
- self.base_url = base_url
- self.api_key = api_key
-
- self.requests = requests.Session()
-
- retry_strategy = Retry(
- total=5,
- backoff_factor=10,
- allowed_methods=["HEAD", "GET"],
- status_forcelist=[429, 500, 502, 503, 504]
- )
-
- # No timeouts - set them explicitly on API calls below!
- adapter = HTTPAdapter(max_retries=retry_strategy)
- self.requests.mount("https://", adapter)
- self.requests.mount("http://", adapter)
-
- def add_api_key(self, kwargs):
- # Adds the API key to request params, if one was not
- # already provided outside of the client.
- if 'data' in kwargs:
- params = kwargs['data']
- else:
- params = {}
- kwargs['data'] = params
-
- if 'api_key' not in params:
- params['api_key'] = self.api_key
-
- def get(self, endpoint: str, *args, **kwargs):
- self.add_api_key(kwargs)
- return self.requests.get(self.base_url + endpoint, *args, **kwargs)
-
-
-def download_file(client: ItchApiClient, upload_id: int, download_path: str, creds: dict, print_url: bool=False):
- # No timeouts, chunked uploads, default retry strategy, should be all good?
- try:
- with client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r:
- r.raise_for_status()
- if print_url:
- print(f"Download URL: {r.url}")
-
- with open(download_path, 'wb') as f:
- for chunk in r.iter_content(chunk_size=1048576): # 1MB chunks
- f.write(chunk)
- except requests.exceptions.HTTPError as e:
- raise ItchDownloadError(f"Unrecoverable download error: {e}")
-
-
-def get_download_keys(client: ItchApiClient):
- print("Fetching all download keys...")
- download_keys = {}
- page = 1
-
- while True:
- print(f"Downloading page {page}...")
- try:
- r = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
- r.raise_for_status()
- except Exception as e:
- print(f"Got error while fetching download keys: {e}")
- print(f"Let's just pretend this is enough and move on...")
- break
-
- data = r.json()
- if 'owned_keys' not in data:
- break # Assuming we're out of keys already...
-
- for key in data['owned_keys']:
- download_keys[key['game_id']] = key['id']
-
- if len(data['owned_keys']) == data['per_page']:
- page += 1
- else:
- break
-
- print(f"Fetched {len(download_keys)} download keys.")
- return download_keys
-
-
-def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]:
- if 'jam_games' not in jam_json:
- raise Exception("Provided JSON is not a valid itch.io jam JSON.")
-
- # Extract (id, url) pairs from all the entries.
- return [(int(e['game']['id']), e['game']['title'], e['game']['url']) for e in jam_json['jam_games']]
-
-
-def get_game_jam_json(jam_path: str) -> dict:
- # Do we have an URL?
- jam_path = jam_path.strip()
- if jam_path.startswith("https://") or jam_path.startswith("http://"):
- r = requests.get(jam_path)
- if not r.ok:
- raise Exception(f"Could not download game jam site from {jam_path} (code {r.status_code}): {r.reason}")
-
- jam_id_line = None
- for line in r.text.splitlines():
- if "ViewJam" in line:
- jam_id_line = line
-
- if jam_id_line is None:
- raise Exception(f"Jam site did not contain the ID line - please provide the path to the game jam entries JSON file instead.")
-
- found_ids = re.findall(r'\"id\":([0-9]+)', jam_id_line)
- if len(found_ids) == 0:
- raise Exception(f"Could not extract the jam ID from the provided site.")
-
- jam_id = int(found_ids[0]) # Always grab the first one for now...
- print(f"Extracted jam ID: {jam_id}")
-
- r = requests.get(f"https://itch.io/jam/{jam_id}/entries.json")
- if not r.ok:
- raise Exception(f"Could not download the game jam entries list.")
-
- content = r.text
- elif os.path.isfile(jam_path):
- try:
- with open(jam_path) as f:
- content = f.read()
- except Exception as e:
- raise Exception(f"Could not open/read the game jam entries file: {e}")
- else:
- raise Exception(f"Provided game jam path is invalid (not a link/existing file).")
-
- try:
- jam_json = json.loads(content)
- except json.decoder.JSONDecodeError:
- print(f"Provided game jam entries file is not a valid JSON file.")
-
- return jam_json
-
-
-def download_jam(jam_path: str, download_to: str, api_key: str, continue_from: str=None):
- client = ItchApiClient(ITCH_API, api_key)
- jam_json = get_game_jam_json(jam_path)
-
- # Check API key validity:
- profile_req = client.get("/profile")
- if not profile_req.ok:
- print(f"Provided API key appears to be invalid: {profile_req.text}")
- exit(1)
-
- jobs = parse_jobs(jam_json)
- jobs_successful = []
- jobs_failed = []
-
- download_keys = get_download_keys(client)
- game_id_to_meta = {} # dict[game_id: int, (title: str, url: str)]
-
- for game_id, title, url in jobs:
- game_id_to_meta[game_id] = (title, url)
-
- failed_game_ids = set()
-
- # No "continue from"? Yep, start right away.
- should_process_jobs = continue_from is None
-
- for game_id, title, url in jobs:
- label = f"{title} ({game_id})"
- if not should_process_jobs:
- if game_id == continue_from:
- should_process_jobs = True
- else:
- continue
-
- try:
- download_path = os.path.join(download_to, slugify(title))
- if PEDANTIC_MIRRORING:
- site_mirror_path = os.path.join(download_to, "_sites")
- else:
- site_mirror_path = os.path.join(download_path, "site")
- os.makedirs(download_path, exist_ok=True)
- os.makedirs(site_mirror_path, exist_ok=True)
- except:
- raise ItchDownloadError(f"Could not create download directory: {download_path}")
-
- print(f"Trying to download {label} to {download_path}")
-
- if WGET_PATH is not None:
- print("Downloading site...")
- if PEDANTIC_MIRRORING:
- extra_wget_args = [
- "--timestamping",
- "--span-hosts",
- "--convert-links",
- "--adjust-extension",
- "--page-requisites",
- ]
- else:
- extra_wget_args = []
-
- wget = subprocess.run([
- WGET_PATH,
- *extra_wget_args,
- "--quiet",
- url
- ], cwd=site_mirror_path)
-
- if wget.returncode != 0:
- print(f"Warning: Site mirroring failed/incomplete.")
-
- creds = {}
- if game_id in download_keys:
- creds['download_key_id'] = download_keys[game_id]
- print("Using {creds} for private uploads")
-
- game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
- if not game_uploads_req.ok:
- raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")
-
- game_uploads = game_uploads_req.json()['uploads']
- print(f"Found {len(game_uploads)} upload(s)")
-
- try:
- for upload in game_uploads:
- upload_id = upload['id']
- file_name = upload['filename']
- file_size = upload['size']
- upload_is_external = upload['storage'] == 'external'
-
- print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
- if upload_is_external:
- print("***********************************************************")
- print("* *")
- print("* WARNING: External storage - downloads will likely fail. *")
- print("* Check the URL displayed below manually! *")
- print("* *")
- print("***********************************************************")
-
- target_path = os.path.join(download_path, file_name)
- try:
- download_file(client, upload_id, target_path, creds, print_url=upload_is_external)
- except ItchDownloadError as e:
- jobs_failed.append((game_id, file_name, str(e)))
- print(f"Download failed for {file_name}: {e}")
- continue
-
- try:
- actual_file_size = os.stat(target_path).st_size
- if actual_file_size == file_size:
- jobs_successful.append((game_id, file_name))
- else:
- jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
- except FileNotFoundError:
- jobs_failed.append((game_id, file_name, "Could not download file"))
-
- print(f"Done downloading {label}")
- except ItchDownloadError as e:
- failed_game_ids.append((game_id, str(e)))
- print(message)
- continue
- except Exception as e:
- print(f"Critical error while downloading {label}: {e}")
- failed_game_ids.append((game_id, str(e)))
- traceback.print_exc()
- print(message)
- continue
-
- successful_titles = {}
- for game_id, file_name in jobs_successful:
- if game_id not in successful_titles:
- successful_titles[game_id] = [file_name]
-
- if any(successful_titles):
- print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
- for game_id, files in successful_titles.items():
- print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")
-
- if any(jobs_failed):
- print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
- for game_id, file_name, message in jobs_failed:
- title, url = game_id_to_meta[game_id]
- print(f"{title} - {file_name} - {message}")
- print(f"Title URL: {url}")
-
- if any(failed_game_ids):
- print(f"\nCompletely failed downloads for {len(failed_game_ids)} titles:")
- for game_id, message in failed_game_ids:
- title, url = game_id_to_meta[game_id]
- print(f"{title} ({game_id}) - {url} - {message}")
-
-
-def get_parser():
- parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.")
- parser.add_argument("entries", help="path to the game jam entries.json file")
- parser.add_argument("--api-key", metavar="key", required=True, help="itch.io API key from https://itch.io/user/settings/api-keys")
- parser.add_argument("--download-to", metavar="path", help="directory to save results into (default: current dir)")
- parser.add_argument("--continue-from", metavar="id", type=int, help="skip all entries until the provided entry ID is found")
- return parser
-
-
-def get_download_dir(args: argparse.Namespace) -> str:
- download_to = os.getcwd()
- if args.download_to is not None:
- download_to = os.path.normpath(args.download_to)
- os.makedirs(download_to)
-
- return download_to
-
-
-if __name__ == "__main__":
- args = get_parser().parse_args()
- download_to = get_download_dir(args)
- download_jam(args.entries, download_to, args.api_key, continue_from=args.continue_from)
diff --git a/itch_dl/__init__.py b/itch_dl/__init__.py
new file mode 100644
index 0000000..b794fd4
--- /dev/null
+++ b/itch_dl/__init__.py
@@ -0,0 +1 @@
+__version__ = '0.1.0'
diff --git a/itch_dl/__main__.py b/itch_dl/__main__.py
new file mode 100644
index 0000000..e0f1582
--- /dev/null
+++ b/itch_dl/__main__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/env python3
+from itch_dl.cli import run
+run()
diff --git a/itch_dl/api.py b/itch_dl/api.py
new file mode 100644
index 0000000..21be0f2
--- /dev/null
+++ b/itch_dl/api.py
@@ -0,0 +1,43 @@
+from typing import Optional
+
+from requests import Session
+from urllib3.util.retry import Retry
+from requests.adapters import HTTPAdapter
+
+from .consts import ITCH_API
+
+
+class ItchApiClient:
+ def __init__(self, api_key: str, base_url: Optional[str] = None):
+ self.base_url = base_url or ITCH_API
+ self.api_key = api_key
+
+ self.requests = Session()
+
+ retry_strategy = Retry(
+ total=5,
+ backoff_factor=10,
+ allowed_methods=["HEAD", "GET"],
+ status_forcelist=[429, 500, 502, 503, 504]
+ )
+
+ # No timeouts - set them explicitly on API calls below!
+ adapter = HTTPAdapter(max_retries=retry_strategy)
+ self.requests.mount("https://", adapter)
+ self.requests.mount("http://", adapter)
+
+ def get(self, endpoint: str, append_api_key: bool = True, **kwargs):
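+        """Wrapper around requests.Session.get: resolves endpoints that
+        are not full URLs against the API base URL and, unless
+        append_api_key is False, adds the client's API key to the
+        request data."""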
+ if append_api_key:
+ params = kwargs.get('data') or {}
+
+ if 'api_key' not in params:
+ params['api_key'] = self.api_key
+
+ kwargs['data'] = params
+
+ if endpoint.startswith("https://"):
+ url = endpoint
+ else:
+ url = self.base_url + endpoint
+
+ return self.requests.get(url, **kwargs)
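+
+
+# Usage sketch (the key below is a placeholder, not a real one):
+#
+#   client = ItchApiClient("your-api-key")
+#   r = client.get("/profile", timeout=15)
+#   if r.ok:
+#       print(r.json())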
diff --git a/itch_dl/cli.py b/itch_dl/cli.py
new file mode 100644
index 0000000..8a1b1ef
--- /dev/null
+++ b/itch_dl/cli.py
@@ -0,0 +1,67 @@
+import os
+import logging
+import argparse
+
+from .handlers import get_jobs_for_url_or_path
+from .downloader import drive_downloads
+from .keys import get_download_keys
+from .api import ItchApiClient
+
+logging.basicConfig()
+logging.getLogger().setLevel(logging.INFO)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="Bulk download stuff from Itch.io.")
+ parser.add_argument("url_or_path",
+ help="itch.io URL or path to a game jam entries.json file")
+ parser.add_argument("--api-key", metavar="key", required=True,
+ help="itch.io API key - https://itch.io/user/settings/api-keys")
+ parser.add_argument("--urls-only", action="store_true",
+ help="print scraped game URLs without downloading them")
+ parser.add_argument("--download-to", metavar="path",
+ help="directory to save results into (default: current dir)")
+ parser.add_argument("--parallel", metavar="parallel", type=int, default=1,
+ help="how many threads to use for downloading games (default: 1)")
+ parser.add_argument("--mirror-web", action="store_true",
+ help="try to fetch assets on game sites")
+ parser.add_argument("--verbose", action="store_true",
+ help="print verbose logs")
+ return parser.parse_args()
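+
+
+# Example invocation (key and URL are placeholders):
+#   itch-dl --api-key deadbeef https://itch.io/jam/gbcompo21 --parallel 4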
+
+
+def run() -> int:
+ args = parse_args()
+ if args.verbose:
+ logging.getLogger().setLevel(logging.DEBUG)
+
+ jobs = get_jobs_for_url_or_path(args.url_or_path, args.api_key)
+ jobs = list(set(jobs)) # Deduplicate, just in case...
+ logging.info(f"Found {len(jobs)} URL(s).")
+
+ if len(jobs) == 0:
+ print("No URLs to download.")
+ return 1
+
+ if args.urls_only:
+ for job in jobs:
+ print(job)
+
+ return 0
+
+ download_to = os.getcwd()
+ if args.download_to is not None:
+ download_to = os.path.normpath(args.download_to)
+ os.makedirs(download_to, exist_ok=True)
+
+ client = ItchApiClient(args.api_key)
+
+ # Check API key validity:
+ profile_req = client.get("/profile")
+ if not profile_req.ok:
+ print(f"Provided API key appears to be invalid: {profile_req.text}")
+        return 1
+
+ # Grab all the download keys (there's no way to fetch them per title...):
+ keys = get_download_keys(client)
+
+ return drive_downloads(jobs, download_to, args.api_key, keys, parallel=args.parallel)
diff --git a/itch_dl/consts.py b/itch_dl/consts.py
new file mode 100644
index 0000000..83d8ac8
--- /dev/null
+++ b/itch_dl/consts.py
@@ -0,0 +1,29 @@
+from enum import Enum
+
+ITCH_BASE = "itch.io"
+ITCH_URL = f"https://{ITCH_BASE}"
+ITCH_API = f"https://api.{ITCH_BASE}"
+
+ITCH_BROWSER_TYPES = [
+ "games",
+ "tools",
+ "game-assets",
+ "comics",
+ "books",
+ "physical-games",
+ "soundtracks",
+ "game-mods",
+ "misc",
+]
+
+
+class ItchDownloadResult(Enum):
+ SUCCESS = 0
+ FAILURE = 1
+ MISSING_DOWNLOAD = 2
+ DOWNLOAD_TIMEOUT = 3
+
+
+# I mean, not really a const but eh
+class ItchDownloadError(Exception):
+ pass
diff --git a/itch_dl/downloader.py b/itch_dl/downloader.py
new file mode 100644
index 0000000..79e5c95
--- /dev/null
+++ b/itch_dl/downloader.py
@@ -0,0 +1,251 @@
+import os
+import shutil
+import logging
+import traceback
+import subprocess
+from typing import Tuple, List, Dict, TypedDict, Optional
+
+from slugify import slugify
+import requests
+from requests.exceptions import HTTPError
+
+from tqdm import tqdm
+from tqdm.contrib.concurrent import thread_map
+
+from .api import ItchApiClient
+from .consts import ItchDownloadError, ItchDownloadResult
+
+
+# ------------------------------
+# --- OLD STUFF --- CUT HERE ---
+# ------------------------------
+
+
+WGET_PATH = shutil.which("wget")
+if WGET_PATH is None:
+ print(f"Warning: wget not available, site mirroring will not work!")
+
+
+def download_file(client: ItchApiClient, upload_id: int, download_path: str, creds: dict, print_url: bool=False):
+ # No timeouts, chunked uploads, default retry strategy, should be all good?
+ try:
+ with client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r:
+ r.raise_for_status()
+ if print_url:
+ print(f"Download URL: {r.url}")
+
+ with open(download_path, 'wb') as f:
+ for chunk in r.iter_content(chunk_size=1048576): # 1MB chunks
+ f.write(chunk)
+ except HTTPError as e:
+ raise ItchDownloadError(f"Unrecoverable download error: {e}")
+
+
+def get_meta_for_game_url(game_url: str) -> int:
+    """Finds the Game ID for a Game URL."""
+    data_url = game_url.rstrip("/") + "/data.json"
+    data_req = requests.get(data_url)
+    data_req.raise_for_status()
+
+    data_json = data_req.json()
+    if 'id' not in data_json:
+        raise ItchDownloadError(f"Cannot fetch the Game ID for URL: {game_url}")
+
+    return data_json['id']
+
+
+def download_jam(jam_path: str, download_to: str, api_key: str, continue_from: str=None):
+ client = ItchApiClient(api_key)
+ jam_json = get_game_jam_json(jam_path)
+
+ # Check API key validity:
+ profile_req = client.get("/profile")
+ if not profile_req.ok:
+ print(f"Provided API key appears to be invalid: {profile_req.text}")
+ exit(1)
+
+ jobs = parse_jobs(jam_json)
+ jobs_successful = []
+ jobs_failed = []
+
+ game_id_to_meta = {} # dict[game_id: int, (title: str, url: str)]
+
+ for game_id, title, url in jobs:
+ game_id_to_meta[game_id] = (title, url)
+
+ failed_game_ids = set()
+
+ # No "continue from"? Yep, start right away.
+ should_process_jobs = continue_from is None
+
+ for game_id, title, url in jobs:
+ label = f"{title} ({game_id})"
+ if not should_process_jobs:
+ if game_id == continue_from:
+ should_process_jobs = True
+ else:
+ continue
+
+ try:
+ download_path = os.path.join(download_to, slugify(title))
+ if PEDANTIC_MIRRORING:
+ site_mirror_path = os.path.join(download_to, "_sites")
+ else:
+ site_mirror_path = os.path.join(download_path, "site")
+ os.makedirs(download_path, exist_ok=True)
+ os.makedirs(site_mirror_path, exist_ok=True)
+ except:
+ raise ItchDownloadError(f"Could not create download directory: {download_path}")
+
+ print(f"Trying to download {label} to {download_path}")
+
+ if WGET_PATH is not None:
+ print("Downloading site...")
+ if PEDANTIC_MIRRORING:
+ extra_wget_args = [
+ "--timestamping",
+ "--span-hosts",
+ "--convert-links",
+ "--adjust-extension",
+ "--page-requisites",
+ ]
+ else:
+ extra_wget_args = []
+
+ wget = subprocess.run([
+ WGET_PATH,
+ *extra_wget_args,
+ "--quiet",
+ url
+ ], cwd=site_mirror_path)
+
+ if wget.returncode != 0:
+ print(f"Warning: Site mirroring failed/incomplete.")
+
+ creds = {}
+ if game_id in self.download_keys:
+ creds['download_key_id'] = self.download_keys[game_id]
+ print("Using {creds} for private uploads")
+
+ game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
+ if not game_uploads_req.ok:
+ raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")
+
+ game_uploads = game_uploads_req.json()['uploads']
+ print(f"Found {len(game_uploads)} upload(s)")
+
+ try:
+ for upload in game_uploads:
+ upload_id = upload['id']
+ file_name = upload['filename']
+ file_size = upload['size']
+ upload_is_external = upload['storage'] == 'external'
+
+ print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
+ if upload_is_external:
+ print("***********************************************************")
+ print("* *")
+ print("* WARNING: External storage - downloads will likely fail. *")
+ print("* Check the URL displayed below manually! *")
+ print("* *")
+ print("***********************************************************")
+
+ target_path = os.path.join(download_path, file_name)
+ try:
+ download_file(client, upload_id, target_path, creds, print_url=upload_is_external)
+ except ItchDownloadError as e:
+ jobs_failed.append((game_id, file_name, str(e)))
+ print(f"Download failed for {file_name}: {e}")
+ continue
+
+ try:
+ actual_file_size = os.stat(target_path).st_size
+ if actual_file_size == file_size:
+ jobs_successful.append((game_id, file_name))
+ else:
+ jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
+ except FileNotFoundError:
+ jobs_failed.append((game_id, file_name, "Could not download file"))
+
+ print(f"Done downloading {label}")
+        except ItchDownloadError as e:
+            failed_game_ids.add((game_id, str(e)))
+            print(f"Download failed for {label}: {e}")
+            continue
+        except Exception as e:
+            print(f"Critical error while downloading {label}: {e}")
+            failed_game_ids.add((game_id, str(e)))
+            traceback.print_exc()
+            continue
+
+ successful_titles = {}
+ for game_id, file_name in jobs_successful:
+ if game_id not in successful_titles:
+ successful_titles[game_id] = [file_name]
+
+ if any(successful_titles):
+ print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
+ for game_id, files in successful_titles.items():
+ print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")
+
+ if any(jobs_failed):
+ print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
+ for game_id, file_name, message in jobs_failed:
+ title, url = game_id_to_meta[game_id]
+ print(f"{title} - {file_name} - {message}")
+ print(f"Title URL: {url}")
+
+ if any(failed_game_ids):
+ print(f"\nCompletely failed downloads for {len(failed_game_ids)} titles:")
+ for game_id, message in failed_game_ids:
+ title, url = game_id_to_meta[game_id]
+ print(f"{title} ({game_id}) - {url} - {message}")
+
+
+# ------------------------------
+# --- OLD STUFF --- CUT HERE ---
+# ------------------------------
+
+
+class GameAuthor(TypedDict, total=False):
+ name: str
+ url: str
+
+
+class GameMetadata(TypedDict, total=False):
+ description: str
+
+
+class GameDownloadJob(TypedDict, total=False):
+ url: str
+ game_id: int
+ title: str
+ author: GameAuthor
+ metadata: GameMetadata
+
+
+class GameDownloader:
+ def __init__(self, download_to: str, api_key: str, keys: Dict[int, str]):
+ self.download_to = download_to
+ self.download_keys = keys
+
+ self.client = ItchApiClient(api_key)
+
+ def download(self, url: str):
+ job = GameDownloadJob(url=url)
+ raise NotImplementedError("Not yet!")
+
+
+def drive_downloads(jobs: List[str], download_to: str, api_key: str, keys: Dict[int, str], parallel: int = 1):
+ downloader = GameDownloader(download_to, api_key, keys)
+
+ if parallel > 1:
+        thread_map(downloader.download, jobs, max_workers=parallel)
+ else:
+ for job in tqdm(jobs):
+ downloader.download(job)
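+
+
+# Intended use once GameDownloader.download is implemented - a sketch, with
+# placeholder arguments:
+#   drive_downloads(["https://someone.itch.io/some-game"], os.getcwd(), "api-key", {})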
diff --git a/itch_dl/handlers.py b/itch_dl/handlers.py
new file mode 100644
index 0000000..184db57
--- /dev/null
+++ b/itch_dl/handlers.py
@@ -0,0 +1,218 @@
+import re
+import json
+import os.path
+import logging
+import urllib.parse
+from typing import List, Optional
+
+from bs4 import BeautifulSoup
+
+from .api import ItchApiClient
+from .consts import ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES, ItchDownloadError
+
+
+def get_jobs_for_game_jam_json(game_jam_json: dict) -> List[str]:
+ if 'jam_games' not in game_jam_json:
+ raise Exception("Provided JSON is not a valid itch.io jam JSON.")
+
+ return [g['game']['url'] for g in game_jam_json['jam_games']]
+
+
+def get_int_after_marker_in_json(text: str, marker: str, key: str) -> Optional[int]:
+ """
+ Many itch.io sites use a pattern like this: Most of the HTML page
+ is prerendered, but certain interactive objects are handled with
+ JavaScript initialized with `I.WidgetHandler({"id": 123, ...})`
+ somewhere near the end of each page. Those config blocks often
+ contain metadata like game/page IDs that we want to extract.
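+
+    For example, a jam page containing `I.ViewJam({"id": 123456, ...})`
+    (hypothetical ID) makes get_int_after_marker_in_json(html, "I.ViewJam", "id")
+    return 123456.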
+ """
+ marker_line: Optional[str] = None
+ for line in reversed(text.splitlines()):
+ marker_index = line.find(marker)
+ if marker_index != -1:
+ marker_line = line[marker_index:]
+ break
+
+ if marker_line is None:
+ return None
+
+ # Notice double-slashes in the f-string (not r-string)!
+ pattern = f'\\"{key}\\":\\s?(\\d+)'
+
+ found_ints = re.findall(pattern, marker_line)
+ if len(found_ints) != 1:
+ return None
+
+ return int(found_ints[0])
+
+
+def get_game_jam_json(jam_url: str, client: ItchApiClient) -> dict:
+ r = client.get(jam_url)
+ if not r.ok:
+ raise ItchDownloadError(f"Could not download the game jam site: {r.status_code} {r.reason}")
+
+ jam_id: Optional[int] = get_int_after_marker_in_json(r.text, "I.ViewJam", "id")
+ if jam_id is None:
+ raise ItchDownloadError("Provided site did not contain the Game Jam ID. Provide "
+ "the path to the game jam entries JSON file instead, or "
+ "create an itch-dl issue with the Game Jam URL.")
+
+ logging.info(f"Extracted Game Jam ID: {jam_id}")
+ r = client.get(f"{ITCH_URL}/jam/{jam_id}/entries.json")
+ if not r.ok:
+ raise ItchDownloadError(f"Could not download the game jam entries list: {r.status_code} {r.reason}")
+
+ return r.json()
+
+
+def get_jobs_for_browse_url(url: str, client: ItchApiClient) -> List[str]:
+ """
+    Every browse page has a hidden RSS feed that can be accessed by
+    appending .xml to its URL. An optional "page" argument lets us
+    iterate over their contents. When no more elements are available,
+    the last returned page has no <item> children.
+
+ The input URL is cleaned in the main URL handler, so append the
+ .xml?page=N suffix and iterate until we've caught 'em all.
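+
+    For example (hypothetical tag): https://itch.io/games/tag-horror.xml?page=2
+    returns the second page of the matching games feed.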
+ """
+ page = 1
+ found_urls = set()
+ logging.info(f"Scraping game URLs from RSS feeds for %s", url)
+
+ while True:
+ logging.info(f"Downloading page {page} (found {len(found_urls)} URLs total)")
+ r = client.get(f"{url}.xml?page={page}", append_api_key=False)
+ if not r.ok:
+ logging.info("RSS feed returned %s, finished.", r.reason)
+ break
+
+ soup = BeautifulSoup(r.text, features="xml")
+ rss_items = soup.find_all("item")
+ if len(rss_items) < 1:
+ logging.info("No more items, finished.")
+ break
+
+ logging.info(f"Found {len(rss_items)} items.")
+ for item in rss_items:
+ link_node = item.find("link")
+ if link_node is None:
+ continue
+
+ node_url = link_node.text.strip()
+ if len(node_url) > 0:
+ found_urls.add(node_url)
+
+ page += 1
+
+ if len(found_urls) == 0:
+ raise ItchDownloadError("No game URLs found to download.")
+
+ return list(found_urls)
+
+
+def get_jobs_for_itch_url(url: str, client: ItchApiClient) -> List[str]:
+ if url.startswith("http://"):
+ logging.info("HTTP link provided, upgrading to HTTPS")
+ url = "https://" + url[7:]
+
+ if url.startswith(f"https://www.{ITCH_BASE}/"):
+ logging.info(f"Correcting www.{ITCH_BASE} to {ITCH_BASE}")
+ url = ITCH_URL + '/' + url[20:]
+
+ url_parts = urllib.parse.urlparse(url)
+ url_path_parts: List[str] = [x for x in str(url_parts.path).split('/') if len(x) > 0]
+
+ if url_parts.netloc == ITCH_BASE:
+ if len(url_path_parts) == 0:
+ raise NotImplementedError("itch-dl cannot download the entirety of itch.io.")
+ # (yet) (also leafo would not be happy with the bandwidth bill)
+
+ site = url_path_parts[0]
+
+ if site == "jam": # Game jams
+ if len(url_path_parts) < 2:
+ raise ValueError(f"Incomplete game jam URL: {url}")
+
+ logging.info("Fetching Game Jam JSON...")
+ clean_game_jam_url = f"{ITCH_URL}/jam/{url_path_parts[1]}"
+ game_jam_json = get_game_jam_json(clean_game_jam_url, client)
+ return get_jobs_for_game_jam_json(game_jam_json)
+
+ elif site in ITCH_BROWSER_TYPES: # Browser
+ clean_browse_url = '/'.join([ITCH_URL, *url_path_parts])
+ return get_jobs_for_browse_url(clean_browse_url, client)
+
+ elif site in ("b", "bundle"): # Bundles
+ raise NotImplementedError("itch-dl cannot download bundles yet.")
+
+ elif site in ("j", "jobs"): # Jobs...
+ raise ValueError("itch-dl cannot download a job.")
+
+ elif site in ("t", "board", "community"): # Forums
+ raise ValueError("itch-dl cannot download forums.")
+
+ elif site == "profile": # Forum Profile
+ if len(url_path_parts) >= 2:
+ username = url_path_parts[1]
+ logging.info("Correcting user profile to creator page for %s", username)
+ return get_jobs_for_itch_url(f"https://{username}.{ITCH_BASE}", client)
+
+ raise ValueError("itch-dl expects a username in profile links.")
+
+ # Something else?
+ raise NotImplementedError(f"itch-dl does not understand \"{site}\" URLs. Please file a new issue.")
+
+ elif url_parts.netloc.endswith(f".{ITCH_BASE}"):
+ if len(url_path_parts) == 0: # Author
+ # TODO: Find I.UserPage, regex for "user_id": [0-9]+, find the responsible API?
+ raise NotImplementedError("itch-dl cannot download author pages yet.")
+
+ else: # Single game
+ # Just clean and return the URL:
+ return [f"https://{url_parts.netloc}/{url_path_parts[0]}"]
+
+ else:
+ raise ValueError(f"Unknown domain: {url_parts.netloc}")
+
+
+def get_jobs_for_path(path: str) -> List[str]:
+ try: # Game Jam Entries JSON?
+ with open(path) as f:
+ json_data = json.load(f)
+
+ if not isinstance(json_data, dict):
+ raise ValueError(f"File does not contain a JSON dict: {path}")
+
+ if 'jam_games' in json_data:
+ logging.info("Parsing provided file as a Game Jam Entries JSON...")
+ return get_jobs_for_game_jam_json(json_data)
+ except json.JSONDecodeError:
+ pass # Not a valid JSON, okay...
+
+ url_list = []
+ with open(path) as f: # Plain job list?
+ for line in f:
+ line = line.strip()
+ if line.startswith("https://") or line.startswith("http://"):
+ url_list.append(line)
+
+ if len(url_list) > 0:
+ logging.info("Parsing provided file as a list of URLs to fetch...")
+ return url_list
+
+ raise ValueError(f"File format is unknown - cannot read URLs to download.")
+
+
+def get_jobs_for_url_or_path(path_or_url: str, api_key: str) -> List[str]:
+ """Returns a list of Game URLs for a given itch.io URL or file."""
+ path_or_url = path_or_url.strip()
+
+ if path_or_url.startswith("http://"):
+ logging.info("HTTP link provided, upgrading to HTTPS")
+ path_or_url = "https://" + path_or_url[7:]
+
+ if path_or_url.startswith("https://"):
+ client = ItchApiClient(api_key)
+ return get_jobs_for_itch_url(path_or_url, client)
+    elif os.path.isfile(path_or_url):
+        return get_jobs_for_path(path_or_url)
+    else:
+        raise ValueError(f"Provided input is not a valid URL or file path: {path_or_url}")
diff --git a/itch_dl/keys.py b/itch_dl/keys.py
new file mode 100644
index 0000000..ed09694
--- /dev/null
+++ b/itch_dl/keys.py
@@ -0,0 +1,31 @@
+import logging
+from typing import Dict
+
+from .api import ItchApiClient
+
+
+def get_download_keys(client: ItchApiClient) -> Dict[int, str]:
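+    """Fetches all download keys available to the account. Returns a
+    mapping of game IDs to download key IDs (used elsewhere in this
+    codebase as the download_key_id credential for restricted uploads)."""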
+ logging.info("Fetching all download keys...")
+ download_keys = {}
+ page = 1
+
+ while True:
+ logging.info(f"Downloading page {page} (found {len(download_keys)} keys total)")
+ r = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
+ if not r.ok:
+ break
+
+ data = r.json()
+ if 'owned_keys' not in data:
+ break # Assuming we're out of keys already...
+
+ for key in data['owned_keys']:
+ download_keys[key['game_id']] = key['id']
+
+ if len(data['owned_keys']) == data['per_page']:
+ page += 1
+ else:
+ break
+
+ logging.info(f"Fetched {len(download_keys)} download keys.")
+ return download_keys
diff --git a/pyproject.toml b/pyproject.toml
index 1b48b94..2f33141 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "itch-dl"
-packages = [{ include = "itchdl" }]
+packages = [{ include = "itch_dl" }]
version = "0.1.0"
description = "itch.io bulk game downloader"
homepage = "https://github.com/DragoonAethis/itch-dl"
@@ -24,11 +24,15 @@ classifiers = [
[tool.poetry.dependencies]
python = "^3.8"
-requests = "^2.26.0"
-python-slugify = "^5.0.0"
+tqdm = "^4.64.0"
+urllib3 = "^1.26.9"
+requests = "^2.27.1"
+python-slugify = "^6.1.2"
+beautifulsoup4 = "^4.11.1"
+lxml = "^4.8.0"
-[tool.poetry.dev-dependencies]
-pytest = "^6.2"
+[tool.poetry.scripts]
+itch-dl = "itch_dl.cli:run"
[build-system]
requires = ["poetry-core>=1.0.0"]