Trial The Second: Make leafo happy by not murdering his servers

Drop Selenium and clicking around itch - use the API instead.
Pulls HTML5 games, is way faster, uses fewer resources on both ends.
Ryszard Knop 2021-10-03 03:49:49 +02:00
parent 5d0b8e1e99
commit f86044050f
3 changed files with 243 additions and 172 deletions
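
The core of the change: a plain requests session with retry and backoff logic replaces the headless browser. A condensed sketch of the pattern, extracted from the ItchApiClient code in the diff below (the API key value is a placeholder):

```python
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Back off instead of hammering the server: up to 5 attempts, with growing
# pauses, on rate limits (429) and transient server errors (5xx).
retry_strategy = Retry(
    total=5,
    backoff_factor=10,
    allowed_methods=["HEAD", "GET"],
    status_forcelist=[429, 500, 502, 503, 504]
)

session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

# Authenticated API calls then go through the session, with explicit timeouts:
r = session.get("https://api.itch.io/profile", data={"api_key": "YOUR-KEY-HERE"}, timeout=15)
```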

README.md

@@ -6,20 +6,21 @@ What you'll need:
 - Python 3.8+
 - `pip install -r requirements.txt`
-- [chromedriver](https://chromedriver.chromium.org/downloads) somewhere in your PATH
+- For site mirroring, [wget](https://www.gnu.org/software/wget/) in your PATH.
-On Arch, `pacman -S python chromium python-selenium python-requests python-slugify` works.
+On Arch, `pacman -S wget python python-requests python-slugify` works.

 How to use this:

 - Go to your jam's page, ex. https://itch.io/jam/gbcompo21 and right-click -> View Source.
 - Ctrl-F for `"id":` - it should find that text once, followed by a number. Write it down.
+- (If you found it multiple times, grab the one after ViewJam something something.)
 - Download https://itch.io/jam/NUMBER/entries.json (replacing NUMBER with what you wrote down)
-- Run the downloader: `python downloader.py entries.json`
+- Generate a new API key on your user account page: https://itch.io/user/settings/api-keys
+- Run the downloader: `python downloader.py --api-key <KEY> entries.json`
 - Wait. This is going to take a while.

-**This downloader does not (and probably will not) support HTML5-only games.** (For some of
-these, you might get lucky by hitting F12 while the game loads and grabbing what's in there.)
+The downloader is able to grab more or less everything you can download via the itch app.

 It's expected that the downloader output will not be complete - logs are stupidly verbose and
 it prints a report on successful/failed downloads, so you must manually grab whatever was not
@@ -28,3 +29,5 @@ handled for you automatically for some reason.
 The downloader also grabs the entry page HTML, which usually comes with controls and such. It
 does not download images, external assets and so on, just the text - if the Itch page dies,
 so will most elements on those downloaded pages. Controls should survive, though.
+
+(There's a pedantic mirroring toggle in the script, if you know what you're doing.)
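
If you'd rather script the entries.json step, a minimal sketch of the fetch described above (the jam ID is a placeholder - substitute the number you wrote down):

```python
import requests

JAM_ID = 123456  # placeholder - use your jam's real ID

r = requests.get(f"https://itch.io/jam/{JAM_ID}/entries.json", timeout=15)
r.raise_for_status()

with open("entries.json", "wb") as f:
    f.write(r.content)
```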

downloader.py (Normal file → Executable file)

@@ -4,21 +4,28 @@ import os
 import sys
 import json
 import time
+import shutil
 import hashlib
 import argparse
 import traceback
+import subprocess

 from enum import Enum
-from multiprocessing import Pool

 import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
 from slugify import slugify

-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import NoSuchElementException, TimeoutException
+WGET_PATH = shutil.which("wget")
+if WGET_PATH is None:
+    print("Warning: wget not available, site mirroring will not work!")
+
+# Try to download all site assets, images etc. included.
+# You probably don't want this, but here you go!
+PEDANTIC_MIRRORING = False
+
+ITCH_API = "https://api.itch.io"
 class ItchDownloadResult(Enum):
@@ -28,206 +35,263 @@ class ItchDownloadResult(Enum):
     DOWNLOAD_TIMEOUT = 3
+
+class ItchDownloadError(Exception):
+    pass
+
+
+class ItchApiClient():
+    def __init__(self, base_url: str, api_key: str):
+        self.base_url = base_url
+        self.api_key = api_key
+
+        self.requests = requests.Session()
+        retry_strategy = Retry(
+            total=5,
+            backoff_factor=10,
+            allowed_methods=["HEAD", "GET"],
+            status_forcelist=[429, 500, 502, 503, 504]
+        )
+
+        # No timeouts - set them explicitly on API calls below!
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.requests.mount("https://", adapter)
+        self.requests.mount("http://", adapter)
+
+    def add_api_key(self, kwargs):
+        # Adds the API key to request params, if one was not
+        # already provided outside of the client.
+        if 'data' in kwargs:
+            params = kwargs['data']
+        else:
+            params = {}
+            kwargs['data'] = params
+
+        if 'api_key' not in params:
+            params['api_key'] = self.api_key
+
+    def get(self, endpoint: str, *args, **kwargs):
+        self.add_api_key(kwargs)
+        return self.requests.get(self.base_url + endpoint, *args, **kwargs)
+
+
+def download_file(client: ItchApiClient, upload_id: int, download_path: str, print_url: bool=False):
+    # No timeouts, chunked uploads, default retry strategy, should be all good?
+    try:
+        with client.get(f"/uploads/{upload_id}/download", stream=True) as r:
+            r.raise_for_status()
+
+            if print_url:
+                print(f"Download URL: {r.url}")
+
+            with open(download_path, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=1048576):  # 1MB chunks
+                    f.write(chunk)
+    except requests.exceptions.HTTPError as e:
+        raise ItchDownloadError(f"Unrecoverable download error: {e}")
+
+
+def get_download_keys(client: ItchApiClient):
+    print("Fetching all download keys...")
+    download_keys = {}
+    page = 1
+
+    while True:
+        print(f"Downloading page {page}...")
+        try:
+            r = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
+            r.raise_for_status()
+        except Exception as e:
+            print(f"Got error while fetching download keys: {e}")
+            print("Let's just pretend this is enough and move on...")
+            break
+
+        data = r.json()
+        if 'owned_keys' not in data:
+            break  # Assuming we're out of keys already...
+
+        for key in data['owned_keys']:
+            download_keys[key['game_id']] = key['id']
+
+        if len(data['owned_keys']) == data['per_page']:
+            page += 1
+        else:
+            break
+
+    print(f"Fetched {len(download_keys)} download keys.")
+    return download_keys


 def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]:
     if 'jam_games' not in jam_json:
         raise Exception("Provided JSON is not a valid itch.io jam JSON.")

     # Extract (id, url) pairs from all the entries.
-    return [(e['game']['id'], e['game']['title'], e['game']['url']) for e in jam_json['jam_games']]
+    return [(int(e['game']['id']), e['game']['title'], e['game']['url']) for e in jam_json['jam_games']]


-def try_extract_download_links(driver: webdriver.Chrome, title_url: str) -> list[str]:
-    elems = driver.find_elements(By.CLASS_NAME, "download_btn")
-    if len(elems) == 0:
-        raise NoSuchElementException("No download links found.")
-
-    cookie = driver.get_cookie("itchio_token")['value']
-    file_ids = [elem.get_attribute("data-upload_id") for elem in elems]
-
-    file_urls = []
-    for file_id in file_ids:
-        meta_url = f"{title_url}/file/{file_id}"
-        r = requests.post(meta_url, data={"csrf_token": cookie})
-        if r.ok:
-            file_urls.append(r.json()['url'])
-        else:
-            print(f"Error downloading metadata for file {file_id} (status {r.status_code}): {r.text}")
-
-    print(f"Extracted URLs: {file_urls}")
-    return file_urls
-
-
-def download_link(link: str, path: str) -> tuple[bool, str]:
-    r = requests.get(link)
-    if not r.ok:
-        return (False, r.reason)
-
-    # The bytes we need:
-    content = r.content
-
-    # Figure out the filename:
-    if 'Content-Disposition' in r.headers:
-        name = r.headers['Content-Disposition']
-        name = name.removeprefix('attachment; filename="').removesuffix('"')
-    else:  # uhhhh random bullshit go, good luck?
-        md5 = hashlib.md5()
-        md5.update(content)
-        name = md5.hexdigest()
-
-    # Make sure we don't overwrite files with the same name.
-    fcounter = 1
-    filename = f"{path}/{name}"
-    while os.path.exists(filename):
-        fcounter += 1
-        filename = f"{path}/{name}.{fcounter}"
-
-    try:
-        with open(filename, 'wb') as f:
-            f.write(content)
-    except Exception as e:
-        return (False, f"Cannot write output file: {e}")
-
-    return (True, "Success")
-
-
-def download_files(links, path) -> list[tuple[bool, str]]:
-    if len(links) == 0:
-        print(f"Nothing to download into {path}")
-        return []
-
-    with Pool(len(links)) as p:
-        results = p.starmap(download_link, [(link, path) for link in links])
-    return results
-
-
-def parse_download_results(results, method) -> tuple[ItchDownloadResult, str]:
-    global_success = True
-    for success, reason in results:
-        if not success:
-            print(f"Download failed: {reason}")
-            global_success = False
-
-    if global_success:
-        return (ItchDownloadResult.SUCCESS, f"Method #{method} successful.")
-    else:
-        return (ItchDownloadResult.FAILURE, f"Method #{method} partially successful (downloads failed).")
-
-
-def download_title(title_id: int, title_url: str, download_path: str) -> (ItchDownloadResult, str):
-    options = Options()
-    options.add_argument("--headless")
-
-    with webdriver.Chrome(options=options) as driver:
-        wait = WebDriverWait(driver, timeout=15)
-        driver.get(title_url)
-
-        with open(f"{download_path}/index.html", 'w') as f:
-            f.write(driver.page_source)
-
-        skip_purchase_locator = (By.CLASS_NAME, "direct_download_btn")
-
-        try:
-            print("Trying method #1: Purchase Workflow")
-            elem = driver.find_element(By.CLASS_NAME, "buy_btn")
-            elem.click()
-
-            elem = wait.until(EC.presence_of_element_located(skip_purchase_locator))
-            elem.click()
-
-            wait.until(EC.number_of_windows_to_be(2))
-            time.sleep(1)
-
-            first_tab = driver.current_window_handle
-            for window_handle in driver.window_handles:
-                if window_handle != first_tab:
-                    driver.switch_to.window(window_handle)
-                    break
-
-            # We're now on the main downloads page.
-            download_links = try_extract_download_links(driver, title_url)
-            results = download_files(download_links, download_path)
-            return parse_download_results(results, 1)
-        except TimeoutException:
-            print("Method #1 took too long - sleeping for 1m to avoid ~ mystery funsies ~")
-            time.sleep(60)
-            return ItchDownloadResult.DOWNLOAD_TIMEOUT, "Download timed out"
-        except NoSuchElementException:
-            print("Method #1 failed.")
-
-        try:
-            print("Trying method #2: Direct Download Workflow")
-            download_links = try_extract_download_links(driver, title_url)
-            results = download_files(download_links, download_path)
-            return parse_download_results(results, 2)
-        except NoSuchElementException:
-            print("Method #2 failed.")
-
-    print("File links missing/no method able to handle target URL.")
-    return ItchDownloadResult.MISSING_DOWNLOAD, "No download method worked."
-
-
-def download_jam(path_to_json: str, continue_from: str=None):
+def download_jam(path_to_json: str, download_to: str, api_key: str, continue_from: str=None):
     try:
try: try:
with open(path_to_json) as f: with open(path_to_json) as f:
jam_json = json.load(f) jam_json = json.load(f)
except FileNotFoundError: except FileNotFoundError:
print(f"File {path_to_json} not found.") print(f"File {path_to_json} not found.")
except json.decoder.JSONDecodeError: except json.decoder.JSONDecodeError:
print(F"Provided file is not a valid JSON file.") print(F"Provided entries file is not a valid JSON file.")

+    client = ItchApiClient(ITCH_API, api_key)
+
+    # Check API key validity:
+    profile_req = client.get("/profile", timeout=15)
+    if not profile_req.ok:
+        print(f"Provided API key appears to be invalid: {profile_req.text}")
+        exit(1)

     jobs = parse_jobs(jam_json)

     jobs_successful = []
     jobs_failed = []
+
+    download_keys = get_download_keys(client)
+
+    game_id_to_meta = {}  # dict[game_id: int, (title: str, url: str)]
+    for game_id, title, url in jobs:
+        game_id_to_meta[game_id] = (title, url)
+
+    failed_game_ids = []
# No "continue from"? Yep, start right away. # No "continue from"? Yep, start right away.
should_process_jobs = continue_from is None should_process_jobs = continue_from is None
for job in jobs: for game_id, title, url in jobs:
game_id, title, url = job label = f"{title} ({game_id})"
if not should_process_jobs: if not should_process_jobs:
if game_id == continue_from: if game_id == continue_from:
should_process_jobs = True should_process_jobs = True
else: else:
continue continue
-        r = requests.get(f"{url}/data.json")
-        if r.status_code != 200:
-            print(f"Missing data for {url}, probably invalid")
-            failed_jobs += url
-            continue
-
-        download_path = os.path.join(os.getcwd(), slugify(title))
-        print(f"Trying to download {title} ({game_id}) to {download_path}")
-
-        if not os.path.isdir(download_path):
-            os.mkdir(download_path)
-
-        try:
-            status, message = download_title(game_id, url, download_path)
-            print(f"{title}: {status}, {message}")
-
-            if status == ItchDownloadResult.SUCCESS:
-                jobs_successful.append((title, download_path))
-            else:
-                jobs_failed.append((status, title, url, message))
-        except Exception as e:
-            print(f"Download failed for {title} ({game_id}): {e}")
-            traceback.print_exc()
-            continue
-
-    print(f"\nAll done, downloaded files successfully for {len(jobs_successful)} title(s):")
-    for title, download_path in jobs_successful:
-        print(title)
-
-    print(f"\nDownloads failed for {len(jobs_failed)} title(s):")
-    for status, title, url, message in jobs_failed:
-        print(f"{title} - {url} - {status}: {message}")
+        try:
+            download_path = os.path.join(download_to, slugify(title))
+            if PEDANTIC_MIRRORING:
+                site_mirror_path = os.path.join(download_to, "_sites")
+            else:
+                site_mirror_path = os.path.join(download_path, "site")
+
+            os.makedirs(download_path, exist_ok=True)
+            os.makedirs(site_mirror_path, exist_ok=True)
+        except OSError:
+            raise ItchDownloadError(f"Could not create download directory: {download_path}")
+
+        print(f"Trying to download {label} to {download_path}")
+
+        if WGET_PATH is not None:
+            print("Downloading site...")
+            if PEDANTIC_MIRRORING:
+                extra_wget_args = [
+                    "--timestamping",
+                    "--span-hosts",
+                    "--convert-links",
+                    "--adjust-extension",
+                    "--page-requisites",
+                ]
+            else:
+                extra_wget_args = []
+
+            wget = subprocess.run([
+                WGET_PATH,
+                *extra_wget_args,
+                "--quiet",
+                url
+            ], cwd=site_mirror_path)
+
+            if wget.returncode != 0:
+                print("Warning: Site mirroring failed/incomplete.")
+
+        creds = {}
+        if game_id in download_keys:
+            creds['download_key_id'] = download_keys[game_id]
+            print(f"Using {creds} for private uploads")
+
+        game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
+        if not game_uploads_req.ok:
+            raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")
+
+        game_uploads = game_uploads_req.json()['uploads']
+        print(f"Found {len(game_uploads)} upload(s)")
+
+        try:
+            for upload in game_uploads:
+                upload_id = upload['id']
+                file_name = upload['filename']
+                file_size = upload['size']
+                upload_is_external = upload['storage'] == 'external'
+
+                print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
+                if upload_is_external:
+                    print("***********************************************************")
+                    print("*                                                         *")
+                    print("* WARNING: External storage - downloads will likely fail. *")
+                    print("* Check the URL displayed below manually!                 *")
+                    print("*                                                         *")
+                    print("***********************************************************")
+
+                target_path = os.path.join(download_path, file_name)
+                try:
+                    download_file(client, upload_id, target_path, print_url=upload_is_external)
+                except ItchDownloadError as e:
+                    jobs_failed.append((game_id, file_name, str(e)))
+                    print(f"Download failed for {file_name}: {e}")
+                    continue
+
+                try:
+                    actual_file_size = os.stat(target_path).st_size
+                    if actual_file_size == file_size:
+                        jobs_successful.append((game_id, file_name))
+                    else:
+                        jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
+                except FileNotFoundError:
+                    jobs_failed.append((game_id, file_name, "Could not download file"))
+
+            print(f"Done downloading {label}")
+        except ItchDownloadError as e:
+            failed_game_ids.append((game_id, str(e)))
+            print(str(e))
+            continue
+        except Exception as e:
+            print(f"Critical error while downloading {label}: {e}")
+            failed_game_ids.append((game_id, str(e)))
+            traceback.print_exc()
+            continue
+
+    successful_titles = {}
+    for game_id, file_name in jobs_successful:
+        if game_id not in successful_titles:
+            successful_titles[game_id] = [file_name]
+        else:
+            successful_titles[game_id].append(file_name)
+
+    if any(successful_titles):
+        print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
+        for game_id, files in successful_titles.items():
+            print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")
+
+    if any(jobs_failed):
+        print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
+        for game_id, file_name, message in jobs_failed:
+            title, url = game_id_to_meta[game_id]
+            print(f"{title} - {file_name} - {message}")
+            print(f"Title URL: {url}")
+
+    if any(failed_game_ids):
+        print(f"\nCompletely failed downloads for {len(failed_game_ids)} title(s):")
+        for game_id, message in failed_game_ids:
+            title, url = game_id_to_meta[game_id]
+            print(f"{title} ({game_id}) - {url} - {message}")


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.")
     parser.add_argument("entries", help="path to the game jam entries.json file")
-    parser.add_argument("--continue-from", metavar="ID", help="skip all entries until the provided entry ID is found")
+    parser.add_argument("--api-key", metavar="key", required=True, help="itch.io API key from https://itch.io/user/settings/api-keys")
+    parser.add_argument("--download-to", metavar="path", help="directory to save results into (default: current dir)")
+    parser.add_argument("--continue-from", metavar="id", help="skip all entries until the provided entry ID is found")
     args = parser.parse_args()

     continue_id = args.continue_from
@@ -238,4 +302,9 @@ if __name__ == "__main__":
print("ID to continue from must be an integer.") print("ID to continue from must be an integer.")
exit(1) exit(1)
download_jam(args.entries, continue_from=continue_id) download_to = os.getcwd()
if args.download_to is not None:
download_to = os.path.normpath(args.download_to)
os.makedirs(download_to)
download_jam(args.entries, download_to, args.api_key, continue_from=continue_id)
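
Taken together, the new arguments give an invocation along these lines (the key, target directory, and entry ID are placeholders):

```
python downloader.py --api-key abc123 --download-to ./gbcompo21 --continue-from 123456 entries.json
```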

requirements.txt

@@ -1,3 +1,2 @@
-selenium
 requests
 python-slugify